In [None]:


class TextTranslationService:
    """Translate HTML, XML, JSON or plain-text documents while keeping their structure.

    ``process_file`` drives the pipeline: an OpenAI completion model classifies
    the input and extracts its translatable strings, a text-analytics client
    finds entities that must not be translated, a translator client translates
    each string, and the completion model rebuilds the document.
    """

    def __init__(self, azure_openai_key: str, azure_openai_endpoint: str,
                 text_analytics_key: str, text_analytics_endpoint: str,
                 translator_key: str, translator_endpoint: str):
        """Configure the module-level ``openai`` client for Azure.

        Args:
            azure_openai_key: API key assigned to ``openai.api_key``.
            azure_openai_endpoint: Endpoint assigned to ``openai.api_base``.
            text_analytics_key: Key for the (currently disabled) text-analytics client.
            text_analytics_endpoint: Endpoint for the (currently disabled) text-analytics client.
            translator_key: Key for the (currently disabled) translator client.
            translator_endpoint: Endpoint for the (currently disabled) translator client.
        """
        # These settings live on the ``openai`` module; the instance has no
        # ``openai`` attribute, so assigning through ``self.openai`` raised
        # AttributeError before anything was configured.
        openai.api_type = "azure"
        openai.api_base = azure_openai_endpoint
        openai.api_version = "2023-05-15"
        openai.api_key = azure_openai_key

        # The SDK clients below are still disabled (their imports are commented
        # out). Define the attributes so dependent methods can report a clear
        # error; assign real clients here or after construction to enable them.
        self.text_analytics_client = None
        self.translator_client = None

        # self.text_analytics_client = TextAnalyticsClient(
        #     endpoint=text_analytics_endpoint,
        #     credential=AzureKeyCredential(text_analytics_key)
        # )

        # self.translator_client = TranslatorTextClient(
        #     credentials=ApiKeyCredentials({"Ocp-Apim-Subscription-Key": translator_key}),
        #     endpoint=translator_endpoint
        # )

    @staticmethod
    def _parse_string_list(raw: str) -> List[str]:
        """Parse a Python list literal returned by the model without executing code.

        Raises:
            ValueError: If ``raw`` is not a literal, or the literal is not a list.
            SyntaxError: If ``raw`` is not valid Python literal syntax.
        """
        import ast

        # Model output is untrusted text: literal_eval only accepts literals,
        # whereas eval() would run arbitrary expressions.
        parsed = ast.literal_eval(raw.strip())
        if not isinstance(parsed, list):
            raise ValueError(f"Expected a list literal, got {type(parsed).__name__}")
        return parsed

    def identify_input_type(self, content: str) -> str:
        """Ask the model for the content type, using the first 500 characters of ``content``.

        Returns:
            The model's answer, stripped and lower-cased.
        """
        prompt = f"Identify the type of the following content. Possible types are HTML, XML, JSON, or plain text. Content: {content[:500]}"
        response = openai.Completion.create(engine="YOUR_ENGINE_NAME", prompt=prompt, max_tokens=50)
        return response.choices[0].text.strip().lower()

    def extract_translatable_content(self, content: str, input_type: str) -> List[str]:
        """Ask the model for the translatable strings of ``content`` as a list.

        Raises:
            ValueError, SyntaxError: If the model's reply is not a list literal.
        """
        prompt = f"Extract the translatable content from the following {input_type} input. Return the content as a Python list of strings. Input: {content}"
        response = openai.Completion.create(engine="YOUR_ENGINE_NAME", prompt=prompt, max_tokens=1000)
        return self._parse_string_list(response.choices[0].text)

    def identify_domain_specific_words(self, content: List[str]) -> List[str]:
        """Return the distinct entity texts that should be protected from translation.

        All strings in ``content`` are joined into one document and passed to
        ``self.text_analytics_client.recognize_entities``. Results flagged with
        ``is_error`` are skipped; entities are kept when their category is
        Organization, Person, Location or Product.

        Returns:
            Unique entity texts in order of first appearance.

        Raises:
            RuntimeError: If no text-analytics client has been configured.
        """
        if self.text_analytics_client is None:
            raise RuntimeError("text_analytics_client is not configured")

        kept_categories = {"Organization", "Person", "Location", "Product"}
        documents = [" ".join(content)]
        response = self.text_analytics_client.recognize_entities(documents)
        domain_specific_words = [
            entity.text
            for result in response
            if not result.is_error
            for entity in result.entities
            if entity.category in kept_categories
        ]
        # dict.fromkeys de-duplicates while keeping a stable order
        # (list(set(...)) changed order from run to run).
        return list(dict.fromkeys(domain_specific_words))

    def translate_content(self, content: List[str], target_language: str, domain_specific_words: List[str]) -> List[str]:
        """Translate each string, wrapping protected words in ``<keep>`` tags around the call.

        Args:
            content: Strings to translate.
            target_language: Language code passed to the translator as ``to``.
            domain_specific_words: Words wrapped as ``<keep>word</keep>`` before
                translation and unwrapped afterwards.

        Returns:
            The translated strings, in the same order as ``content``.

        Raises:
            RuntimeError: If no translator client has been configured.
        """
        if self.translator_client is None:
            raise RuntimeError("translator_client is not configured")

        # str.replace("", x) inserts x between every character, so drop empties.
        protected_words = [word for word in domain_specific_words if word]

        translated_content = []
        for text in content:
            for word in protected_words:
                text = text.replace(word, f"<keep>{word}</keep>")

            # NOTE(review): TranslateRequestParams comes from an import that is
            # commented out in this file; restore it before calling this method.
            params = TranslateRequestParams(text=text, to=[target_language])
            response = self.translator_client.translate(params)

            translated_text = response[0].translations[0].text
            for word in protected_words:
                translated_text = translated_text.replace(f"<keep>{word}</keep>", word)

            translated_content.append(translated_text)

        return translated_content

    def reconstruct_output(self, original_content: str, translated_content: List[str], input_type: str) -> str:
        """Ask the model to rebuild ``original_content`` using ``translated_content``.

        Returns:
            The model's reply with surrounding whitespace stripped.
        """
        prompt = f"""
        Reconstruct the original {input_type} format using the translated content.
        Original content structure: {original_content}
        Translated content: {translated_content}
        Return the reconstructed {input_type} content.
        """
        response = openai.Completion.create(engine="YOUR_ENGINE_NAME", prompt=prompt, max_tokens=1000)
        return response.choices[0].text.strip()

    def process_file(self, file_content: str, target_language: str) -> str:
        """Run the full identify / extract / protect / translate / reconstruct pipeline.

        Returns:
            The reconstructed document produced by ``reconstruct_output``.
        """
        input_type = self.identify_input_type(file_content)
        translatable_content = self.extract_translatable_content(file_content, input_type)
        domain_specific_words = self.identify_domain_specific_words(translatable_content)
        translated_content = self.translate_content(translatable_content, target_language, domain_specific_words)
        reconstructed_output = self.reconstruct_output(file_content, translated_content, input_type)
        return reconstructed_output

In [2]:
import json
import xml.etree.ElementTree as ET
from typing import Dict, List, Union
from bs4 import BeautifulSoup
from azure.core.credentials import AzureKeyCredential
# from azure.ai.textanalytics import TextAnalyticsClient
# from azure.cognitiveservices.language.translatortext import TranslatorTextClient
# from azure.cognitiveservices.language.translatortext.models import TranslateRequestParams
# from msrest.authentication import ApiKeyCredentials
import openai

In [None]:

# Example usage for different file types
# One sample document per supported input kind (HTML, XML, JSON, plain text).
html_content = "<html><body><h1>Hello, World!</h1><p>This is a test.</p></body></html>"
xml_content = "<root><item>Translate me</item><item>And me too</item></root>"
json_content = '{"title": "Translate this", "description": "And this as well"}'
plain_text = "This is a plain text file that needs translation."

target_language = "es"  # Spanish

# NOTE(review): `translation_service` is not defined anywhere in the visible
# source; presumably a TextTranslationService instance must be created first.
for content in [html_content, xml_content, json_content, plain_text]:
    translated_output = translation_service.process_file(content, target_language)
    print(f"Translated output: {translated_output}\n")

In [3]:
import openai
from typing import Dict, List, Union

In [22]:
from openai import OpenAI


class LLMTextProcessingService:
    """Use an OpenAI chat model to analyse documents before and after translation.

    The service identifies the input type, extracts translatable strings and
    domain-specific words, and rebuilds the document from translated strings.
    The translation itself is left to the caller.
    """

    def __init__(self, openai_key: str, openai_org: str, openai_proj: str):
        """Store the credentials used for every request.

        Args:
            openai_key: OpenAI API key.
            openai_org: OpenAI organization identifier.
            openai_proj: OpenAI project identifier.
        """
        openai.api_key = openai_key
        openai.organization = openai_org
        openai.project = openai_proj

        # Keep the credentials so requests use what the caller supplied;
        # previously _query_llm built a client from hard-coded empty strings.
        self._openai_key = openai_key
        self._openai_org = openai_org
        self._openai_proj = openai_proj
        self._client = None  # created lazily on the first request

    def _get_client(self):
        """Return the shared ``OpenAI`` client, creating it on first use."""
        if self._client is None:
            self._client = OpenAI(
                api_key=self._openai_key,
                # Empty strings are treated as "not set".
                organization=self._openai_org or None,
                project=self._openai_proj or None,
            )
        return self._client

    @staticmethod
    def _parse_python_list(raw: str) -> list:
        """Parse a Python list literal from model output without executing code.

        A surrounding Markdown code fence (three backticks, optional language
        tag) is removed first.

        Raises:
            ValueError: If the text is not a literal, or the literal is not a list.
            SyntaxError: If the text is not valid Python literal syntax.
        """
        import ast

        text = raw.strip()
        if text.startswith("```"):
            # Drop the opening fence line and everything from the closing fence on.
            text = text.split("\n", 1)[1] if "\n" in text else ""
            text = text.rsplit("```", 1)[0]
        # Model output is untrusted: literal_eval accepts literals only,
        # whereas eval() would run arbitrary expressions.
        parsed = ast.literal_eval(text.strip())
        if not isinstance(parsed, list):
            raise ValueError(f"Expected a list literal, got {type(parsed).__name__}")
        return parsed

    def _query_llm(self, prompt: str, max_tokens: int = 50) -> str:
        """Send ``prompt`` as a single user message and return the reply text.

        Args:
            prompt: User message content.
            max_tokens: Upper bound on the length of the reply.

        Returns:
            The assistant message content, stripped ("" if the content is None).
        """
        response = self._get_client().chat.completions.create(
            model="gpt-4o-2024-08-06",  # Use "gpt-4" or "gpt-3.5-turbo"
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=max_tokens
        )
        # Chat completions expose the text on message.content (not .text), and
        # the value has to be returned for the callers below to work.
        reply = response.choices[0].message.content
        return (reply or "").strip()

    def identify_input_type(self, content: str) -> str:
        """Ask the model for the content type, using the first 500 characters.

        Returns:
            The model's answer, lower-cased.
        """
        prompt = f"""Identify the type of the following content. 
        Possible types are HTML, XML, JSON, or plain text. 
        Respond with only the type in lowercase.
        Content: {content[:500]}"""

        return self._query_llm(prompt).lower()

    def extract_translatable_content(self, content: str, input_type: str) -> List[str]:
        """Ask the model for the translatable strings of ``content``.

        Raises:
            ValueError, SyntaxError: If the reply is not a Python list literal.
        """
        prompt = f"""Extract the translatable content from the following {input_type} input.
        Return the content as a Python list of strings, where each string is a separate piece of text to be translated.
        Do not include any markup or structural elements, only the text content.
        Input: {content}"""

        # The default budget of 50 tokens would cut the list off mid-literal.
        result = self._query_llm(prompt, max_tokens=1000)
        return self._parse_python_list(result)

    def identify_domain_specific_words(self, content: List[str]) -> List[str]:
        """Ask the model which words or phrases must not be translated.

        Only the first 100 items of ``content`` are sent, to bound prompt length.
        ``process_file`` depends on this method, so it is active again rather
        than commented out.

        Raises:
            ValueError, SyntaxError: If the reply is not a Python list literal.
        """
        prompt = f"""Identify domain-specific words or phrases from the following content.
        These are words that should not be translated, such as proper nouns, technical terms, or brand names.
        Return the identified words as a Python list of strings.
        Content: {' '.join(content[:100])}"""

        result = self._query_llm(prompt, max_tokens=1000)
        return self._parse_python_list(result)

    def reconstruct_output(self, original_content: str, translated_content: List[str], input_type: str) -> str:
        """Ask the model to rebuild ``original_content`` using ``translated_content``.

        Returns:
            The reconstructed document as returned by the model.
        """
        prompt = f"""Reconstruct the original {input_type} format using the translated content.
        Original content structure: {original_content}
        Translated content: {translated_content}
        Ensure that the structure and formatting of the original content is preserved,
        replacing only the translatable text with the corresponding translated content.
        Return the reconstructed {input_type} content."""

        return self._query_llm(prompt, max_tokens=2000)

    def process_file(self, file_content: str) -> Dict[str, Union[str, List[str]]]:
        """Analyse ``file_content`` and collect everything needed for translation.

        Returns:
            A dict with keys ``input_type``, ``translatable_content``,
            ``domain_specific_words`` and ``original_content``.
        """
        input_type = self.identify_input_type(file_content)
        translatable_content = self.extract_translatable_content(file_content, input_type)
        domain_specific_words = self.identify_domain_specific_words(translatable_content)

        return {
            "input_type": input_type,
            "translatable_content": translatable_content,
            "domain_specific_words": domain_specific_words,
            "original_content": file_content
        }

    def reconstruct_file(self, processed_data: Dict[str, Union[str, List[str]]], translated_content: List[str]) -> str:
        """Rebuild a document from the dict returned by ``process_file``.

        Args:
            processed_data: Must contain ``original_content`` and ``input_type``.
            translated_content: Translated strings, in extraction order.
        """
        return self.reconstruct_output(
            processed_data["original_content"],
            translated_content,
            processed_data["input_type"]
        )

In [None]:
# Usage example
# Runs the analyse -> (simulated) translate -> reconstruct round trip on four
# sample documents and prints each intermediate result.
if __name__ == "__main__":
    # Credentials are blank placeholders in this file.
    openai_key = ""
    openai_org = ""
    openai_proj = ""

    llm_text_processing_service = LLMTextProcessingService(openai_key, openai_org, openai_proj)

    # Example usage for different file types
    html_content = "<html><body><h1>Hello, World!</h1><p>This is a test.</p></body></html>"
    xml_content = "<root><item>Translate me</item><item>And me too</item></root>"
    json_content = '{"title": "Translate this", "description": "And this as well"}'
    plain_text = "This is a plain text file that needs translation."

    for content in [html_content, xml_content, json_content, plain_text]:
        # process_file returns a dict; the keys read below are the ones it builds.
        processed_data = llm_text_processing_service.process_file(content)
        print(f"Input Type: {processed_data['input_type']}")
        print(f"Translatable Content: {processed_data['translatable_content']}")
        print(f"Domain-specific Words: {processed_data['domain_specific_words']}")
        print("---")

        # Simulating translation (replace this with your actual translation API call)
        translated_content = [f"Translated: {text}" for text in processed_data['translatable_content']]

        reconstructed_content = llm_text_processing_service.reconstruct_file(processed_data, translated_content)
        print(f"Reconstructed Content: {reconstructed_content}")
        print("=" * 50)

In [34]:
from openai import OpenAI 
import os

## Set the API key and model name
# MODEL and client are module-level names used by later cells.
MODEL="gpt-4o-2024-08-06"
# NOTE(review): the credentials are blank placeholders here; `os` is imported
# above, presumably so they can be read from the environment instead.
client = OpenAI(
    api_key="",
    organization="",
    project="",
    )

In [None]:
# Smoke test: send one system + user message pair to MODEL and print the reply.
completion = client.chat.completions.create(
  model=MODEL,
  messages=[
    {"role": "system", "content": "You are a helpful assistant. Help me with my math homework!"}, # <-- This is the system message that provides context to the model
    {"role": "user", "content": "Hello! Could you solve 2+2?"}  # <-- This is the user message for which the model will generate a response
  ]
)

# The reply text of a chat completion is read from message.content.
print("Assistant: " + completion.choices[0].message.content)

In [4]:
def extract_translatable_content(self, content: str, input_type: str) -> List[Dict[str, Union[str, List[str]]]]:
    """Ask the LLM for the translatable elements of ``content`` as structured data.

    Written as a method: ``self`` must provide ``_query_llm(prompt) -> str``.

    Args:
        content: The raw document text.
        input_type: The document type named in the prompt (e.g. "html").

    Returns:
        The list of dictionaries parsed from the model's reply. Each dictionary
        is requested to carry ``type``, ``content``, ``metadata`` and
        ``non_translatable`` keys.

    Raises:
        ValueError: If the reply is not a literal, or the literal is not a list.
        SyntaxError: If the reply is not valid Python literal syntax.
    """
    import ast

    # Literal braces of the example structure are doubled so the f-string emits
    # them as text. Left single, the template was evaluated as a Python
    # expression and failed with "unhashable type: 'dict'" before any request.
    prompt = f"""
        Analyze and extract the translatable content from the following {input_type} input.
        Return the content as a Python list of dictionaries, where each dictionary represents a translatable element with the 
        following structure:
        [{{
            "type": "The type of element (e.g., 'paragraph', 'heading', 'list_item', 'table_cell', etc.)",
            "content": "The actual text content to be translated",
            "metadata": {{
                "tag": "The original HTML tag or markdown syntax (if applicable)",
                "attributes": "Any relevant attributes (for HTML)",
                "formatting": "Any special formatting instructions"
            }},
            "non_translatable": ["List of domain-specific terms or phrases that should not be translated"]
        }}]

        Rules:
        1. Preserve the structure of the original {input_type} input.
        2. Identify and mark domain-specific terms or technical jargon as non-translatable.
        3. Include relevant metadata to aid in reconstructing the original format after translation.
        4. For plain text input, use "paragraph" as the type and omit the metadata.

        Input: {content}

        Ensure the output is a valid Python list of dictionaries.
    """

    result = self._query_llm(prompt).strip()
    if result.startswith("```"):
        # Drop a surrounding Markdown code fence (opening line and closing fence).
        result = result.split("\n", 1)[1] if "\n" in result else ""
        result = result.rsplit("```", 1)[0]

    # Model output is untrusted: literal_eval accepts literals only, whereas
    # eval() would execute arbitrary expressions.
    parsed = ast.literal_eval(result.strip())
    if not isinstance(parsed, list):
        raise ValueError(f"Expected a list literal, got {type(parsed).__name__}")
    return parsed

In [5]:
def reconstruct_output(self, original_content: str, extracted_structure: List[Dict[str, Union[str, List[str]]]], translated_content: List[str], input_type: str) -> str:
    """Ask the LLM to rebuild the document from its translated pieces.

    Written as a method: ``self`` must provide ``_query_llm(prompt)``.

    Args:
        original_content: The untranslated document, embedded verbatim in the prompt.
        extracted_structure: The element descriptions produced during extraction.
        translated_content: Translated strings, in extraction order.
        input_type: The document type named in the prompt.

    Returns:
        Whatever ``self._query_llm`` returns for the reconstruction prompt.
    """
    instructions = f"""
        Reconstruct the original {input_type} format using the translated content and extracted structure.

        Original content: 
        {original_content}

        Extracted structure:
        {extracted_structure}

        Translated content (in order of extraction):
        {translated_content}

        Instructions:
        1. Use the extracted structure to guide the reconstruction process.
        2. Replace the original text in each extracted element with the corresponding translated text.
        3. Preserve all original formatting, tags, attributes, and non-translatable content.
        4. Ensure that the reconstructed content maintains the same structure and order as the original.
        5. For any domain-specific terms or proper nouns that were marked as non-translatable, use the original text.
        6. If there are any placeholders or variables in the original content, ensure they are correctly maintained in the translated version.

        Return the fully reconstructed {input_type} content, ensuring it's a valid and well-formatted {input_type} document.
    """

    # The model's reply is handed back to the caller untouched.
    return self._query_llm(instructions)

In [6]:
def identify_input_type(self, content: str) -> str:
    """Classify ``content`` as one of the formats listed in the prompt.

    Written as a method: ``self`` must provide ``_query_llm(prompt) -> str``.
    Only the first 1000 characters of ``content`` are sent.

    Returns:
        The model's answer with surrounding whitespace removed, lower-cased.
    """
    snippet = content[:1000]
    prompt = f"""
        Analyze the following content and identify its type. Respond with a single word in lowercase, choosing from:
            - html
            - xml
            - json
            - markdown
            - yaml
            - csv
            - plaintext

        Use 'plaintext' if the content doesn't match any specific format.
        
        Guidelines:
        - Look for distinctive markers like HTML tags, XML declarations, JSON brackets, or Markdown syntax.
        - Consider structure and formatting, not just the presence of certain characters.
        - If multiple formats are present, choose the predominant one.

        Content: {snippet}

        Type:
    
    """

    answer = self._query_llm(prompt)
    normalized = answer.strip().lower()
    return normalized

In [None]:
def extract_content_for_translation(data):
    """Collect the ``"content"`` value of every item in ``data``, in order."""
    contents = []
    for element in data:
        contents.append(element["content"])
    return contents

In [None]:
# Assuming evaluated_result contains the list of dictionaries
# NOTE(review): `evaluated_result` is not defined in the visible source. The
# list built here holds the original "content" strings, despite its name.
translated_content = [element["content"] for element in evaluated_result]
print(translated_content)


In [7]:
def read_file_content(file_path):
    """Return the entire text of ``file_path``, decoded as UTF-8."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        text = handle.read()
    return text

In [9]:
# Assuming you have an instance of the class that contains identify_input_type
# analyzer = YourClass()

# Read the contents of your files
# The path below is an absolute Windows path specific to one machine; the file
# is read as UTF-8 text by read_file_content.
html_content = read_file_content(r'C:\Users\asua\DataScience Exercises\Desktop\index.html')
# xml_content = read_file_content('path/to/your/xml_file.xml')
# json_content = read_file_content('path/to/your/json_file.json')

# Now you can call identify_input_type with the file contents
# html_type = analyzer.identify_input_type(html_content)
# xml_type = analyzer.identify_input_type(xml_content)
# json_type = analyzer.identify_input_type(json_content)

# print(f"HTML file type: {html_type}")
# print(f"XML file type: {xml_type}")
# print(f"JSON file type: {json_type}")

In [10]:
html_content

'<!DOCTYPE html>\n<html>\n<head>\n    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n    <meta name="viewport" content="width=1920"/>\n    <meta name="description"\n          content="Tizen Download API Demo" />\n\n    <title>Tizen Download API</title>\n\n    <script type=\'text/javascript\' src=\'$WEBAPIS/webapis/webapis.js\'></script>\n    <link rel="stylesheet" type="text/css" href="style.css" />\n\n</head>\n\n<body>\n\n    <h1>Tizen Download API</h1>\n\n    <div class="left">\n        <h2>Available buttons/actions:</h2>\n\n        <div>\n            1 - Downloaded file lists<br>\n            2 - Start download - small file<br>\n            3 - Start download - large file<br>\n            Pause - Pause download<br>\n            Play - Resume download<br>\n            Stop - Cancel download<br>\n            0 - Clear logs\n        </div>\n\n        <div id="progress" class="progress">\n            <div id="bar" class="bar"></div>\n        </div>\n\n        <fiel

In [None]:
# from pydantic import BaseModel, Field
# from typing import List, Dict, Union, Optional
# from openai import OpenAI

# client = OpenAI()
# MODEL = "gpt-3.5-turbo"  # or your preferred model

# class Metadata(BaseModel):
#     tag: Optional[str] = None
#     attributes: Optional[str] = None
#     formatting: Optional[str] = None

# class TranslatableElement(BaseModel):
#     type: str
#     content: str
#     metadata: Optional[Metadata] = None
#     non_translatable: List[str] = Field(default_factory=list)

# class TranslatableContent(BaseModel):
#     elements: List[TranslatableElement]

# def extract_translatable_content(content: str, input_type: str) -> List[Dict[str, Union[str, List[str]]]]:
#     prompt = f"""
#         Analyze and extract the translatable content from the following {input_type} input.
#         Return the content as a structured list of elements, where each element represents a translatable item.

#         Rules:
#         1. Preserve the structure of the original {input_type} input.
#         2. Identify and mark domain-specific terms or technical jargon as non-translatable.
#         3. Include relevant metadata to aid in reconstructing the original format after translation.
#         4. For plain text input, use "paragraph" as the type and omit the metadata.

#         Input: {content}

#         Ensure the output follows the structure defined in the TranslatableContent model.
#     """
    
#     completion = client.chat.completions.create(
#         model=MODEL,
#         messages=[
#             {"role": "system", "content": "You are a helpful assistant that extracts translatable content from various input types."},
#             {"role": "user", "content": prompt},
#         ],
#         response_model=TranslatableContent,
#     )

#     return [element.model_dump(exclude_none=True) for element in completion.choices[0].message.elements]

# # Example usage
# input_content = """
# <h1>Welcome to our website</h1>
# <p>We offer the best <strong>AI-powered</strong> solutions for your business.</p>
# <ul>
#     <li>Machine Learning</li>
#     <li>Natural Language Processing</li>
#     <li>Computer Vision</li>
# </ul>
# """

# result = extract_translatable_content(input_content, "HTML")
# print(result)

In [None]:
def extract_translatable_content(content: str, input_type: str) -> List[Dict[str, Union[str, List[str]]]]:
    """Extract translatable elements from ``content`` via structured model output.

    Uses the module-level ``client`` and ``MODEL`` and the ``TranslatableContent``
    pydantic model as the response schema.

    Args:
        content: The raw document text.
        input_type: The document type named in the prompt (e.g. "HTML").

    Returns:
        One dict per extracted element (``model_dump`` with ``None`` fields dropped).

    Raises:
        ValueError: If the response carries no parsed payload (e.g. a refusal).
    """
    prompt = f"""
        Analyze and extract the translatable content from the following {input_type} input.
        Return the content as a structured list of elements, where each element represents a translatable item.

        Rules:
        1. Preserve the structure of the original {input_type} input.
        2. Identify and mark domain-specific terms or technical jargon as non-translatable.
        3. Include relevant metadata to aid in reconstructing the original format after translation.
        4. For plain text input, use "paragraph" as the type and omit the metadata.

        Input: {content}

        Ensure the output follows the structure defined in the TranslatableContent model.
    """

    # `response_model=` is not accepted by chat.completions.create. The SDK's
    # structured-output entry point is `parse`, which takes the pydantic model
    # as `response_format` and exposes the validated instance on message.parsed.
    completion = client.beta.chat.completions.parse(
        model=MODEL,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that extracts translatable content from various input types."},
            {"role": "user", "content": prompt},
        ],
        response_format=TranslatableContent,
    )

    message = completion.choices[0].message
    if message.parsed is None:
        raise ValueError(f"Model returned no structured output: {message.refusal}")

    return [element.model_dump(exclude_none=True) for element in message.parsed.elements]

# Example usage for different input types
# Each print below triggers one request through the module-level `client`.
html_input = "<h1>Welcome</h1><p>This is a paragraph.</p>"
xml_input = "<root><item>XML content</item></root>"
json_input = '{"key": "JSON value", "nested": {"subkey": "Nested value"}}'
plain_text_input = "This is plain text.\nIt has multiple lines."

print(extract_translatable_content(html_input, "HTML"))
print(extract_translatable_content(xml_input, "XML"))
print(extract_translatable_content(json_input, "JSON"))
print(extract_translatable_content(plain_text_input, "plain text"))

In [None]:
# from typing import List, Dict, Union
# from pydantic import BaseModel, Field
# from azure.openai import AzureOpenAI
# import os

# # Azure OpenAI settings
# azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
# api_key = os.getenv("AZURE_OPENAI_KEY")
# deployment_name = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")

# # Initialize Azure OpenAI client
# client = AzureOpenAI(
#     api_key=api_key,
#     api_version="2023-05-15",
#     azure_endpoint=azure_endpoint
# )

# class Metadata(BaseModel):
#     tag: str = None
#     attributes: str = None
#     formatting: str = None

# class TranslatableElement(BaseModel):
#     type: str
#     content: str
#     metadata: Metadata = None
#     non_translatable: List[str] = Field(default_factory=list)

# class TranslatableContent(BaseModel):
#     elements: List[TranslatableElement]

# def extract_translatable_content(content: str, input_type: str) -> List[Dict[str, Union[str, List[str]]]]:
#     prompt = f"""
#         Analyze and extract the translatable content from the following {input_type} input.
#         Return the content as a structured list of elements, where each element represents a translatable item.

#         Rules:
#         1. Preserve the structure of the original {input_type} input.
#         2. Identify and mark domain-specific terms or technical jargon as non-translatable.
#         3. Include relevant metadata to aid in reconstructing the original format after translation.
#         4. For plain text input, use "paragraph" as the type and omit the metadata.

#         Input: {content}

#         Ensure the output follows the structure defined in the TranslatableContent model.
#     """
    
#     try:
#         response = client.chat.completions.create(
#             model=deployment_name,  # Use the deployment name here
#             messages=[
#                 {"role": "system", "content": "You are a helpful assistant that extracts translatable content from various input types."},
#                 {"role": "user", "content": prompt},
#             ],
#             response_format={"type": "json_object"}
#         )
        
#         # Parse the JSON response
#         result = TranslatableContent.parse_raw(response.choices[0].message.content)
#         return [element.model_dump(exclude_none=True) for element in result.elements]
    
#     except Exception as e:
#         print(f"Error occurred: {str(e)}")
#         return []

# # Example usage
# html_input = "<h1>Welcome to Azure</h1><p>This is a cloud service.</p>"
# result = extract_translatable_content(html_input, "HTML")
# print(result)

In [None]:
import os
from typing import List, Dict, Union

# AzureOpenAI is provided by the `openai` package; there is no `azure.openai` module.
from openai import AzureOpenAI
from pydantic import BaseModel, Field

# Azure OpenAI settings
azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
api_key = os.getenv("AZURE_OPENAI_KEY")
deployment_name = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")

# Initialize Azure OpenAI client
# Structured outputs (pydantic response schemas) are not available on the
# 2023-05-15 API version, so a GA version that supports them is used.
client = AzureOpenAI(
    api_key=api_key,
    api_version="2024-10-21",
    azure_endpoint=azure_endpoint
)

class Metadata(BaseModel):
    """Optional reconstruction hints attached to a translatable element."""

    # Declared as optional: a plain `str = None` field rejects an explicit
    # null when the payload is validated.
    tag: Union[str, None] = None
    attributes: Union[str, None] = None
    formatting: Union[str, None] = None

class TranslatableElement(BaseModel):
    """One translatable piece of the input document."""

    type: str
    content: str
    # Optional for the same reason as the Metadata fields.
    metadata: Union[Metadata, None] = None
    # default_factory gives every instance its own empty list.
    non_translatable: List[str] = Field(default_factory=list)

class TranslatableContent(BaseModel):
    """Top-level response schema: the list of extracted elements."""

    elements: List[TranslatableElement]

def extract_translatable_content(content: str, input_type: str) -> List[Dict[str, Union[str, List[str]]]]:
    """Extract translatable elements from ``content`` via structured model output.

    Uses the module-level ``client`` and ``deployment_name`` and the
    ``TranslatableContent`` pydantic model as the response schema.

    Args:
        content: The raw document text.
        input_type: The document type named in the prompt (e.g. "HTML").

    Returns:
        One dict per extracted element (``model_dump`` with ``None`` fields
        dropped), or ``[]`` if the request fails or yields no parsed payload;
        the failure is printed rather than raised.
    """
    prompt = f"""
        Analyze and extract the translatable content from the following {input_type} input.
        Return the content as a structured list of elements, where each element represents a translatable item.

        Rules:
        1. Preserve the structure of the original {input_type} input.
        2. Identify and mark domain-specific terms or technical jargon as non-translatable.
        3. Include relevant metadata to aid in reconstructing the original format after translation.
        4. For plain text input, use "paragraph" as the type and omit the metadata.

        Input: {content}

        Ensure the output follows the structure defined in the TranslatableContent model.
    """

    try:
        # `response_model=` is not accepted by chat.completions.create. The
        # SDK's structured-output entry point is `parse`, which takes the
        # pydantic model as `response_format` and exposes the validated
        # instance on message.parsed.
        completion = client.beta.chat.completions.parse(
            model=deployment_name,  # Use the deployment name here
            messages=[
                {"role": "system", "content": "You are a helpful assistant that extracts translatable content from various input types."},
                {"role": "user", "content": prompt},
            ],
            response_format=TranslatableContent
        )

        parsed = completion.choices[0].message.parsed
        if parsed is None:
            print("Error occurred: model returned no structured output")
            return []

        return [element.model_dump(exclude_none=True) for element in parsed.elements]

    except Exception as e:
        # Deliberate best-effort: report the failure and return an empty result.
        print(f"Error occurred: {str(e)}")
        return []

# Example usage
# Runs at cell/module execution time: one request through
# extract_translatable_content, then prints the returned list.
html_input = "<h1>Welcome to Azure</h1><p>This is a cloud service.</p>"
result = extract_translatable_content(html_input, "HTML")
print(result)


def extract_content(data):
    """
    Pull the 'content' value out of every dictionary in the input.

    Args:
    data (list): Dictionaries that each carry a 'content' key.

    Returns:
    list: The 'content' values, in the same order as the input.
    """
    contents = []
    for entry in data:
        contents.append(entry['content'])
    return contents

In [None]:
import os
from typing import Literal

# AzureOpenAI is provided by the `openai` package; there is no `azure.openai` module.
from openai import AzureOpenAI
from pydantic import BaseModel, Field

# Azure OpenAI setup
# Structured outputs (pydantic response schemas) are not available on the
# 2023-05-15 API version, so a GA version that supports them is used.
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_KEY"),
    api_version="2024-10-21",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)

class InputTypeResponse(BaseModel):
    """Response schema carrying the detected content type."""

    # Required field (`...`) restricted to the seven listed format names.
    input_type: Literal['html', 'xml', 'json', 'markdown', 'yaml', 'csv', 'plaintext'] = Field(
        ...,
        description="The identified type of the input content"
    )

class ContentAnalyzer:
    """Classify raw content into one of the formats allowed by ``InputTypeResponse``."""

    def identify_input_type(self, content: str) -> str:
        """Ask the model for the format of ``content``.

        Only the first 1000 characters are sent. Uses the module-level
        ``client`` and the ``AZURE_OPENAI_DEPLOYMENT_NAME`` environment variable.

        Returns:
            The detected type, or ``"unknown"`` if the request fails or yields
            no parsed payload; the failure is printed rather than raised.
        """
        prompt = f"""
        Analyze the following content and identify its type. Respond with a single word in lowercase, choosing from:
        - html
        - xml
        - json
        - markdown
        - yaml
        - csv
        - plaintext

        Use 'plaintext' if the content doesn't match any specific format.
        
        Guidelines:
        - Look for distinctive markers like HTML tags, XML declarations, JSON brackets, or Markdown syntax.
        - Consider structure and formatting, not just the presence of certain characters.
        - If multiple formats are present, choose the predominant one.

        Content: {content[:1000]}

        Respond only with the type, nothing else.
        """

        try:
            # `response_model=` is not accepted by chat.completions.create. The
            # SDK's structured-output entry point is `parse`, which takes the
            # pydantic model as `response_format` and exposes the validated
            # instance on message.parsed.
            completion = client.beta.chat.completions.parse(
                model=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
                messages=[
                    {"role": "system", "content": "You are a content type analyzer."},
                    {"role": "user", "content": prompt},
                ],
                response_format=InputTypeResponse
            )

            parsed = completion.choices[0].message.parsed
            if parsed is None:
                print("Error occurred: model returned no structured output")
                return "unknown"

            return parsed.input_type
        except Exception as e:
            # Deliberate best-effort: report the failure and fall back.
            print(f"Error occurred: {str(e)}")
            return "unknown"

# Example usage
if __name__ == "__main__":
    analyzer = ContentAnalyzer()
    
    # One sample per type name offered in the prompt, in the order
    # html, xml, json, markdown, yaml, csv, plaintext.
    test_contents = [
        "<html><body><h1>Hello</h1></body></html>",
        "<?xml version='1.0'?><root><element>Content</element></root>",
        '{"key": "value", "array": [1, 2, 3]}',
        "# Markdown Header\n\nThis is some markdown content.",
        "key: value\nnested:\n  subkey: subvalue",
        "column1,column2,column3\nvalue1,value2,value3",
        "Just some plain text content."
    ]

    for content in test_contents:
        input_type = analyzer.identify_input_type(content)
        print(f"Identified type: {input_type}")
        # Only the first 50 characters of each sample are echoed.
        print(f"For content: {content[:50]}...")
        print()

In [None]:
import os
from typing import List, Dict, Union

# AzureOpenAI is provided by the `openai` package; there is no `azure.openai` module.
from openai import AzureOpenAI
from pydantic import BaseModel, Field

# Azure OpenAI setup
# Structured outputs (pydantic response schemas) are not available on the
# 2023-05-15 API version, so a GA version that supports them is used.
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_KEY"),
    api_version="2024-10-21",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)

class ReconstructedContent(BaseModel):
    """Response schema for a reconstruction request; both fields are required."""

    content: str = Field(..., description="The reconstructed content in its original format")
    input_type: str = Field(..., description="The type of the input/output content (e.g., 'html', 'xml', 'json')")

class ContentReconstructor:
    """Rebuild a translated document in its original format using the LLM."""

    def reconstruct_output(
        self, 
        original_content: str, 
        extracted_structure: List[Dict[str, Union[str, List[str]]]], 
        translated_content: List[str], 
        input_type: str
    ) -> str:
        """Ask the model to rebuild ``original_content`` from its translated pieces.

        Uses the module-level ``client`` and the ``AZURE_OPENAI_DEPLOYMENT_NAME``
        environment variable.

        Args:
            original_content: The untranslated document, sent in full.
            extracted_structure: Element descriptions produced during extraction.
            translated_content: Translated strings, in extraction order.
            input_type: The document type named in the prompt.

        Returns:
            The reconstructed document, or ``""`` if the request fails or yields
            no parsed payload; the failure is printed rather than raised.
        """
        # A "# Limiting to first 1000 characters" note used to sit inside this
        # f-string. It was sent to the model as prompt text and was untrue (no
        # truncation happens), so it has been removed from the prompt.
        prompt = f"""
        Reconstruct the original {input_type} format using the translated content and extracted structure.

        Original content: 
        {original_content}

        Extracted structure:
        {extracted_structure}

        Translated content (in order of extraction):
        {translated_content}

        Instructions:
        1. Use the extracted structure to guide the reconstruction process.
        2. Replace the original text in each extracted element with the corresponding translated text.
        3. Preserve all original formatting, tags, attributes, and non-translatable content.
        4. Ensure that the reconstructed content maintains the same structure and order as the original.
        5. For any domain-specific terms, technical terms or proper nouns that were marked as non-translatable, use the original text.
        6. If there are any placeholders or variables in the original content, ensure they are correctly maintained in the translated version.

        Return the fully reconstructed {input_type} content, ensuring it's a valid and well-formatted {input_type} document.
        """

        try:
            # `response_model=` is not accepted by chat.completions.create. The
            # SDK's structured-output entry point is `parse`, which takes the
            # pydantic model as `response_format` and exposes the validated
            # instance on message.parsed.
            completion = client.beta.chat.completions.parse(
                model=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
                messages=[
                    {"role": "system", "content": "You are a content reconstruction specialist."},
                    {"role": "user", "content": prompt},
                ],
                response_format=ReconstructedContent
            )

            result = completion.choices[0].message.parsed
            if result is None:
                print("Error occurred: model returned no structured output")
                return ""

            return result.content  # Return just the content string
        except Exception as e:
            # Deliberate best-effort: report the failure and return an empty string.
            print(f"Error occurred: {str(e)}")
            return ""

# Example usage
if __name__ == "__main__":
    reconstructor = ContentReconstructor()
    
    # Example inputs (you would replace these with your actual data)
    original_content = "<html><body><h1>Hello</h1><p>World</p></body></html>"
    extracted_structure = [
        {"type": "heading", "content": "Hello", "metadata": {"tag": "h1"}},
        {"type": "paragraph", "content": "World", "metadata": {"tag": "p"}}
    ]
    # One translated string per extracted element, in the same order.
    translated_content = ["Bonjour", "le monde"]
    input_type = "html"

    reconstructed = reconstructor.reconstruct_output(
        original_content, 
        extracted_structure, 
        translated_content, 
        input_type
    )

    # reconstruct_output returns "" on failure, so this may print an empty line.
    print("Reconstructed content:")
    print(reconstructed)

In [None]:
import json
import os
from typing import List, Dict, Union

# AzureOpenAI is provided by the `openai` package; there is no `azure.openai` module.
from openai import AzureOpenAI
from pydantic import BaseModel, Field

# Azure OpenAI settings
azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
api_key = os.getenv("AZURE_OPENAI_KEY")
deployment_name = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")

# Initialize Azure OpenAI client
# Structured outputs (pydantic response schemas) are not available on the
# 2023-05-15 API version, so a GA version that supports them is used.
client = AzureOpenAI(
    api_key=api_key,
    api_version="2024-10-21",
    azure_endpoint=azure_endpoint
)

class Metadata(BaseModel):
    """Optional reconstruction hints attached to a translatable element."""

    # Declared as optional: a plain `str = None` field rejects an explicit
    # null when the payload is validated.
    tag: Union[str, None] = None
    attributes: Union[str, None] = None
    formatting: Union[str, None] = None

class TranslatableElement(BaseModel):
    """One translatable piece of the input document."""

    type: str
    content: str
    # Optional for the same reason as the Metadata fields.
    metadata: Union[Metadata, None] = None
    # default_factory gives every instance its own empty list.
    non_translatable: List[str] = Field(default_factory=list)

class TranslatableContent(BaseModel):
    """Top-level response schema: the list of extracted elements."""

    elements: List[TranslatableElement]

def load_rules(file_path: str = 'translation_rules.json') -> Dict:
    """Load translation rules from a JSON file.

    Args:
        file_path: Path of the rules file; it must contain a JSON object.

    Returns:
        The parsed rules, or ``{}`` when the file is missing, is not valid
        JSON, or does not hold a JSON object. Each failure is printed.
    """
    try:
        # Decode explicitly as UTF-8 rather than the platform default encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            rules = json.load(file)
    except FileNotFoundError:
        print(f"Rules file not found: {file_path}")
        return {}
    except json.JSONDecodeError:
        print(f"Invalid JSON in rules file: {file_path}")
        return {}

    # Callers use rules.get(...), so anything but an object (e.g. a top-level
    # list) is rejected here instead of failing later.
    if not isinstance(rules, dict):
        print(f"Rules file must contain a JSON object: {file_path}")
        return {}
    return rules

def extract_translatable_content(content: str, input_type: str) -> List[Dict[str, Union[str, List[str]]]]:
    """Extract translatable elements from ``content`` using rules loaded from disk.

    The ``extraction_rules`` list returned by ``load_rules()`` is inserted into
    the prompt, one rule per line. Uses the module-level ``client`` and
    ``deployment_name`` and the ``TranslatableContent`` pydantic model as the
    response schema.

    Args:
        content: The raw document text.
        input_type: The document type named in the prompt (e.g. "HTML").

    Returns:
        One dict per extracted element (``model_dump`` with ``None`` fields
        dropped), or ``[]`` if the request fails or yields no parsed payload;
        the failure is printed rather than raised.
    """
    # Load rules
    rules = load_rules()

    # Construct prompt with loaded rules
    rules_text = "\n".join(rules.get('extraction_rules', []))
    prompt = f"""
        Analyze and extract the translatable content from the following {input_type} input.
        Return the content as a structured list of elements, where each element represents a translatable item.

        Rules:
        {rules_text}

        Additional instructions:
        1. Preserve the structure of the original {input_type} input.
        2. Include relevant metadata to aid in reconstructing the original format after translation.
        3. For plain text input, use "paragraph" as the type and omit the metadata.

        Input: {content}

        Ensure the output follows the structure defined in the TranslatableContent model.
    """

    try:
        # `response_model=` is not accepted by chat.completions.create. The
        # SDK's structured-output entry point is `parse`, which takes the
        # pydantic model as `response_format` and exposes the validated
        # instance on message.parsed.
        completion = client.beta.chat.completions.parse(
            model=deployment_name,
            messages=[
                {"role": "system", "content": "You are a helpful assistant that extracts translatable content from various input types."},
                {"role": "user", "content": prompt},
            ],
            response_format=TranslatableContent
        )

        parsed = completion.choices[0].message.parsed
        if parsed is None:
            print("Error occurred: model returned no structured output")
            return []

        return [element.model_dump(exclude_none=True) for element in parsed.elements]

    except Exception as e:
        # Deliberate best-effort: report the failure and return an empty result.
        print(f"Error occurred: {str(e)}")
        return []

# Example usage
if __name__ == "__main__":
    # extract_translatable_content reads its rules through load_rules(), whose
    # default path is 'translation_rules.json' relative to the working directory.
    html_input = "<h1>Welcome to Azure</h1><p>This is a cloud service.</p>"
    result = extract_translatable_content(html_input, "HTML")
    print(result)