#### Parser design using pydantic model

### Extracting the string content from the input

In [None]:
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse
import json
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import io

# FastAPI application instance; the /upload/ route defined in this cell is registered on it.
app = FastAPI()

async def read_file_content(file: UploadFile) -> str:
    """
    Read an uploaded file and return its content as a string.

    The handling is chosen from the filename extension (case-insensitive):
    ``json`` is parsed and re-serialised with a 2-space indent, ``xml`` is
    parsed and serialised back with ElementTree, ``html`` is prettified with
    BeautifulSoup, and anything else is returned unchanged.

    :param file: UploadFile object from FastAPI
    :return: Content of the file as a string
    :raises HTTPException: 400 if the bytes are not valid UTF-8 or the
        JSON/XML does not parse; 500 for any other failure.
    """
    try:
        raw_bytes = await file.read()
        content_str = raw_bytes.decode("utf-8")

        # The filename may be missing, and a name without a dot has no
        # extension (a file literally called "json" is not a JSON file).
        filename = file.filename or ""
        _, dot, suffix = filename.rpartition('.')
        file_extension = suffix.lower() if dot else ""

        if file_extension == 'json':
            # Parse JSON and format it; keep non-ASCII text readable instead
            # of escaping it to \uXXXX sequences.
            parsed = json.loads(content_str)
            return json.dumps(parsed, indent=2, ensure_ascii=False)

        if file_extension == 'xml':
            # Parse XML and format it
            root = ET.fromstring(content_str)
            return ET.tostring(root, encoding='unicode', method='xml')

        if file_extension == 'html':
            # Parse HTML and format it
            soup = BeautifulSoup(content_str, 'html.parser')
            return soup.prettify()

        # For other file types (including .txt), return content as is
        return content_str

    except UnicodeDecodeError as e:
        # Undecodable input is a client problem, not a server fault.
        raise HTTPException(status_code=400, detail="File is not valid UTF-8 text") from e
    except json.JSONDecodeError as e:
        raise HTTPException(status_code=400, detail="Invalid JSON file") from e
    except ET.ParseError as e:
        raise HTTPException(status_code=400, detail="Invalid XML file") from e
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error reading file: {str(e)}") from e

@app.post("/upload/")
async def upload_file(file: UploadFile = File(...)):
    """Return the uploaded file's name and its processed text as a JSON response."""
    processed_text = await read_file_content(file)
    payload = {"filename": file.filename, "content": processed_text}
    return JSONResponse(content=payload)


### Detecting the input type

In [None]:
from pydantic import BaseModel, Field
from typing import Literal
# AzureOpenAI is provided by the ``openai`` package; ``azure.openai`` is not an
# importable module.
from openai import AzureOpenAI
import os

# Azure OpenAI setup
# Passing a pydantic model as response_format (structured outputs) requires
# API version 2024-08-01-preview or later, so 2023-05-15 cannot be used here.
# The version can be overridden through AZURE_OPENAI_API_VERSION.
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_KEY"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION", "2024-10-21"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)

# Response schema for the type-detection call: one required field restricted
# to the seven format names listed in the Literal below.
# NOTE(review): documented with comments rather than a class docstring, since
# pydantic is understood to copy a model docstring into the generated JSON
# schema description — confirm before converting.
class InputTypeResponse(BaseModel):
    input_type: Literal['html', 'xml', 'json', 'markdown', 'yaml', 'csv', 'plaintext'] = Field(
        ...,
        description="The identified type of the input content"
    )

class ContentAnalyzer:
    """Classify text into one of the formats allowed by InputTypeResponse."""

    def identify_input_type(self, content: str) -> str:
        """
        Ask the Azure OpenAI deployment which format ``content`` is written in.

        Only the first 500 characters of ``content`` are sent to the model.

        :param content: Raw text to classify
        :return: One of the InputTypeResponse literals, or "unknown" when the
            request fails or the model returns no parsed answer
        """
        prompt = f"""
        Analyze the following content and identify its type. Respond with a single word in lowercase, choosing from:
        - html
        - xml
        - json
        - markdown
        - yaml
        - csv
        - plaintext

        Use 'plaintext' if the content doesn't match any specific format.
        
        Guidelines:
        - Look for distinctive markers like HTML tags, XML declarations, JSON brackets, or Markdown syntax.
        - Consider structure and formatting, not just the presence of certain characters.
        - If multiple formats are present, choose the predominant one.

        Content: {content[:500]}

        Respond only with the type, nothing else.
        """

        try:
            # A pydantic model is only accepted as response_format by the
            # parse() helper; create() rejects it.
            completion = client.beta.chat.completions.parse(
                model=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
                messages=[
                    {"role": "system", "content": "You are a content type analyzer."},
                    {"role": "user", "content": prompt},
                ],
                response_format=InputTypeResponse
            )

            # The validated model instance lives on message.parsed; it is None
            # when the model refused or produced nothing parseable.
            parsed = completion.choices[0].message.parsed
            if parsed is None:
                print("Error occurred: model returned no parsed content")
                return "unknown"
            return parsed.input_type
        except Exception as e:
            # Best-effort boundary: any failure degrades to "unknown".
            print(f"Error occurred: {str(e)}")
            return "unknown"

# Example usage
# if __name__ == "__main__":
#     analyzer = ContentAnalyzer()
    
#     test_contents = [
#         "<html><body><h1>Hello</h1></body></html>",
#         "<?xml version='1.0'?><root><element>Content</element></root>",
#         '{"key": "value", "array": [1, 2, 3]}',
#         "# Markdown Header\n\nThis is some markdown content.",
#         "key: value\nnested:\n  subkey: subvalue",
#         "column1,column2,column3\nvalue1,value2,value3",
#         "Just some plain text content."
#     ]

#     for content in test_contents:
#         input_type = analyzer.identify_input_type(content)
#         print(f"Identified type: {input_type}")
#         print(f"For content: {content[:50]}...")
#         print()

### Extracting the translatable content from the input

In [None]:
from typing import List, Dict, Union
from pydantic import BaseModel, Field
from typing import List, Optional
# AzureOpenAI is provided by the ``openai`` package; ``azure.openai`` is not an
# importable module.
from openai import AzureOpenAI
import os

# Azure OpenAI settings
azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
api_key = os.getenv("AZURE_OPENAI_KEY")
deployment_name = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")

# Initialize Azure OpenAI client
# Structured outputs (a pydantic model as response_format) require API version
# 2024-08-01-preview or later; override with AZURE_OPENAI_API_VERSION if needed.
client = AzureOpenAI(
    api_key=api_key,
    api_version=os.getenv("AZURE_OPENAI_API_VERSION", "2024-10-21"),
    azure_endpoint=azure_endpoint
)

# class Metadata(BaseModel):
#     tag: str = None
#     attributes: str = None
#     formatting: str = None

# class TranslatableElement(BaseModel):
#     type: str
#     content: str
#     metadata: Metadata = None
#     non_translatable: List[str] = Field(default_factory=list)

# class TranslatableContent(BaseModel):
#     elements: List[TranslatableElement]

# Optional structural hints for one extracted element. All three fields are
# free-form strings that default to None.
class Metadata(BaseModel):
    tag: Optional[str] = Field(None, description="Structural identifier for the content element")
    attributes: Optional[str] = Field(None, description="Additional properties or characteristics of the content element")
    formatting: Optional[str] = Field(None, description="Styling or layout information for the content")

# One extracted item: a required type and text, optional Metadata, and a list
# of terms to leave untranslated (empty list by default).
class TranslatableElement(BaseModel):
    type: str = Field(..., description="Category or classification of the content element")
    content: str = Field(..., description="The actual text or data to be translated")
    metadata: Optional[Metadata] = Field(None, description="Supplementary information about the content structure and formatting")
    non_translatable: List[str] = Field(default_factory=list, description="Terms or phrases that should remain in their original form")

# Top-level response model holding the required list of extracted elements;
# extract_translatable_content passes it as response_format.
class TranslatableContent(BaseModel):
    elements: List[TranslatableElement] = Field(..., description="Collection of translatable components extracted from the input")

def extract_translatable_content(content: str, input_type: str) -> List[Dict[str, Union[str, List[str]]]]:
    """
    Ask the Azure OpenAI deployment to split ``content`` into translatable elements.

    Args:
    content (str): The full input text; it is embedded in the prompt as-is.
    input_type (str): Name of the input format, interpolated into the prompt.

    Returns:
    list: One dict per extracted element (None-valued fields omitted), or an
    empty list when the request fails or the model returns no parsed answer.
    """
    prompt = f"""
        Analyze and extract the translatable content from the following {input_type} input.
        Return the content as a structured list of elements, where each element represents a translatable item.

        Rules:
        1. Preserve the structure of the original {input_type} input.
        2. Identify and mark domain-specific terms or technical jargon as non-translatable.
        3. Include relevant metadata to aid in reconstructing the original format after translation.
        4. For plain text input, use "paragraph" as the type and omit the metadata.

        Input: {content}

        Ensure the output follows the structure defined in the TranslatableContent model.
    """

    try:
        # A pydantic model is only accepted as response_format by the parse()
        # helper; create() rejects it.
        completion = client.beta.chat.completions.parse(
            model=deployment_name,  # Use the deployment name here
            messages=[
                {"role": "system", "content": "You are a helpful assistant that extracts translatable content from various input types."},
                {"role": "user", "content": prompt},
            ],
            response_format=TranslatableContent
        )

        # The validated TranslatableContent instance is on message.parsed; it
        # is None when the model refused or produced nothing parseable.
        parsed = completion.choices[0].message.parsed
        if parsed is None:
            print("Error occurred: model returned no parsed content")
            return []
        return [element.model_dump(exclude_none=True) for element in parsed.elements]

    except Exception as e:
        # Best-effort boundary: any failure degrades to an empty result.
        print(f"Error occurred: {str(e)}")
        return []


def extract_content(data):
    """
    Collect the value stored under 'content' in every item of the input.

    Args:
    data (list): Dictionaries that each carry a 'content' key.

    Returns:
    list: The 'content' values, in the same order as the input items.
    """
    contents = []
    for entry in data:
        contents.append(entry['content'])
    return contents


# Example usage
# html_input = "<h1>Welcome to Azure</h1><p>This is a cloud service.</p>"
# result = extract_translatable_content(html_input, "HTML")
# print(result)

### Loading translation rules from a JSON file for translatable content extraction

In [None]:
import json
from typing import List, Dict, Union
from pydantic import BaseModel, Field
# from azure.openai import AzureOpenAI
from typing import List, Optional
import openai
import os

# Azure OpenAI settings
azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
api_key = os.getenv("AZURE_OPENAI_KEY")
deployment_name = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")

# Initialize Azure OpenAI client
# ``openai`` is the imported module and is not callable; the Azure client class
# is ``openai.AzureOpenAI``. Structured outputs (a pydantic model as
# response_format) require API version 2024-08-01-preview or later; override
# with AZURE_OPENAI_API_VERSION if needed.
client = openai.AzureOpenAI(
    api_key=api_key,
    api_version=os.getenv("AZURE_OPENAI_API_VERSION", "2024-10-21"),
    azure_endpoint=azure_endpoint
)

# class Metadata(BaseModel):
#     tag: str = None
#     attributes: str = None
#     formatting: str = None

# class TranslatableElement(BaseModel):
#     type: str
#     content: str
#     metadata: Metadata = None
#     non_translatable: List[str] = Field(default_factory=list)

# class TranslatableContent(BaseModel):
#     elements: List[TranslatableElement]

# Optional structural hints for one extracted element. All three fields are
# free-form strings that default to None. Redefines the Metadata model declared
# in the previous section with the same fields.
class Metadata(BaseModel):
    tag: Optional[str] = Field(None, description="Structural identifier for the content element")
    attributes: Optional[str] = Field(None, description="Additional properties or characteristics of the content element")
    formatting: Optional[str] = Field(None, description="Styling or layout information for the content")

# One extracted item: a required type and text, optional Metadata, and a list
# of terms to leave untranslated (empty list by default). Redefines the model
# of the same name from the previous section with the same fields.
class TranslatableElement(BaseModel):
    type: str = Field(..., description="Category or classification of the content element")
    content: str = Field(..., description="The actual text or data to be translated")
    metadata: Optional[Metadata] = Field(None, description="Supplementary information about the content structure and formatting")
    non_translatable: List[str] = Field(default_factory=list, description="Terms or phrases that should remain in their original form")

# Top-level response model holding the required list of extracted elements;
# the rules-driven extract_translatable_content passes it as response_format.
class TranslatableContent(BaseModel):
    elements: List[TranslatableElement] = Field(..., description="Collection of translatable components extracted from the input")

def load_rules(file_path: str = 'translation_rules.json') -> Dict:
    """
    Load translation rules from a JSON file.

    Args:
    file_path (str): Path of the rules file, read as UTF-8.

    Returns:
    dict: The parsed JSON object, or an empty dict when the file is missing,
    is not valid UTF-8 JSON, or its top-level value is not an object. A
    message is printed in each failure case.
    """
    try:
        # Read as UTF-8 explicitly so non-ASCII rule text does not depend on
        # the platform's default encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            rules = json.load(file)
    except FileNotFoundError:
        print(f"Rules file not found: {file_path}")
        return {}
    except (json.JSONDecodeError, UnicodeDecodeError):
        print(f"Invalid JSON in rules file: {file_path}")
        return {}

    # Callers use dict methods on the result; reject lists, strings, numbers.
    if not isinstance(rules, dict):
        print(f"Rules file must contain a JSON object: {file_path}")
        return {}
    return rules

def extract_translatable_content(content: str, input_type: str) -> List[Dict[str, Union[str, List[str]]]]:
    """
    Split ``content`` into translatable elements, guided by the rules file.

    The 'extraction_rules' entries returned by load_rules() are placed in the
    prompt, one per line, ahead of the fixed additional instructions.

    Args:
    content (str): The full input text; it is embedded in the prompt as-is.
    input_type (str): Name of the input format, interpolated into the prompt.

    Returns:
    list: One dict per extracted element (None-valued fields omitted), or an
    empty list when the request fails or the model returns no parsed answer.
    """
    # Load rules
    rules = load_rules()

    # Construct prompt with loaded rules; tolerate a null entry list and
    # non-string entries instead of crashing in str.join.
    extraction_rules = rules.get('extraction_rules') or []
    rules_text = "\n".join(str(rule) for rule in extraction_rules)
    prompt = f"""
        Analyze and extract the translatable content from the following {input_type} input.
        Return the content as a structured list of elements, where each element represents a translatable item.

        Rules:
        {rules_text}

        Additional instructions:
        1. Preserve the structure of the original {input_type} input.
        2. Include relevant metadata to aid in reconstructing the original format after translation.
        3. For plain text input, use "paragraph" as the type and omit the metadata.

        Input: {content}

        Ensure the output follows the structure defined in the TranslatableContent model.
    """

    try:
        # Use the configured Azure client rather than the module-level default
        # client, and the parse() helper, which is the call that accepts a
        # pydantic model as response_format.
        completion = client.beta.chat.completions.parse(
            model=deployment_name,
            messages=[
                {"role": "system", "content": "You are a helpful assistant that extracts translatable content from various input types based on certain given rules."},
                {"role": "user", "content": prompt},
            ],
            response_format=TranslatableContent
        )

        # The validated TranslatableContent instance is on message.parsed; it
        # is None when the model refused or produced nothing parseable.
        parsed = completion.choices[0].message.parsed
        if parsed is None:
            print("Error occurred: model returned no parsed content")
            return []
        return [element.model_dump(exclude_none=True) for element in parsed.elements]

    except Exception as e:
        # Best-effort boundary: any failure degrades to an empty result.
        print(f"Error occurred: {str(e)}")
        return []

# # Example usage
# if __name__ == "__main__":
#     html_input = "<h1>Welcome to Azure</h1><p>This is a cloud service.</p>"
#     result = extract_translatable_content(html_input, "HTML")
#     print(result)




def extract_content(data):
    """
    Pull the 'content' value out of each dictionary in the input.

    Args:
    data (list): Dictionaries that each carry a 'content' key.

    Returns:
    list: The 'content' values, in the same order as the input items.
    """
    texts = []
    for element in data:
        text = element['content']
        texts.append(text)
    return texts

### Reconstructing the original format from the translated content and saving it to a file

In [None]:
from pydantic import BaseModel, Field
from typing import List, Dict, Union
# AzureOpenAI is provided by the ``openai`` package; ``azure.openai`` is not an
# importable module.
from openai import AzureOpenAI
import os

# Azure OpenAI setup
# Structured outputs (a pydantic model as response_format) require API version
# 2024-08-01-preview or later; override with AZURE_OPENAI_API_VERSION if needed.
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_KEY"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION", "2024-10-21"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)

# Response model for the reconstruction call: the rebuilt document text and
# the format name, both required strings. ContentReconstructor passes it as
# response_format.
class ReconstructedContent(BaseModel):
    content: str = Field(..., description="The reconstructed content in its original format")
    input_type: str = Field(..., description="The type of the input/output content (e.g., 'html', 'xml', 'json')")

class ContentReconstructor:
    """Rebuild a translated document in its original format and save it to disk."""

    def reconstruct_output(
        self,
        original_content: str,
        extracted_structure: List[Dict[str, Union[str, List[str]]]],
        translated_content: List[str],
        input_type: str
    ) -> str:
        """
        Ask the Azure OpenAI deployment to merge translations back into the original document.

        :param original_content: The untranslated source document
        :param extracted_structure: Element dicts produced by the extraction step
        :param translated_content: Translated strings, in extraction order
        :param input_type: Format name, interpolated into the prompt
        :return: The reconstructed document text, or "" when the request fails
            or the model returns no parsed answer
        """

        prompt = f"""
        Reconstruct the original {input_type} format using the translated content and extracted structure.

        Original content: 
        {original_content} 

        Extracted structure:
        {extracted_structure}

        Translated content (in order of extraction):
        {translated_content}

        Instructions:
        1. Use the extracted structure to guide the reconstruction process.
        2. Replace the original text in each extracted element with the corresponding translated text.
        3. Preserve all original formatting, tags, attributes, and non-translatable content.
        4. Ensure that the reconstructed content maintains the same structure and order as the original.
        5. For any domain-specific terms, technical terms or proper nouns that were marked as non-translatable, use the original text.
        6. If there are any placeholders or variables in the original content, ensure they are correctly maintained in the translated version.

        Return the fully reconstructed {input_type} content, ensuring it's a valid and well-formatted {input_type} document.
        """

        try:
            # A pydantic model is only accepted as response_format by the
            # parse() helper; create() rejects it.
            completion = client.beta.chat.completions.parse(
                model=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
                messages=[
                    {"role": "system", "content": "You are a content reconstruction specialist."},
                    {"role": "user", "content": prompt},
                ],
                response_format=ReconstructedContent
            )

            # message.content would be the raw JSON envelope; the document
            # itself is the `content` field of the parsed model.
            parsed = completion.choices[0].message.parsed
            if parsed is None:
                print("Error occurred: model returned no parsed content")
                return ""
            return parsed.content  # Return just the content string
        except Exception as e:
            # Best-effort boundary: any failure degrades to an empty string.
            print(f"Error occurred: {str(e)}")
            return ""

    def save_content_to_file(self, content: str, input_type: str, filename: Union[str, None] = None) -> str:
        """
        Save the reconstructed content to a file with the appropriate extension.

        :param content: The content to be saved
        :param input_type: The type of content (e.g., 'html', 'xml', 'json', 'plaintext')
        :param filename: Optional custom filename (without extension); defaults
            to "reconstructed_content_<input_type>"
        :return: The path of the saved file, or "" if writing failed
        """
        # Map input types to file extensions. Covers every type the detector
        # can report, so markdown/yaml/csv no longer fall back to .txt.
        extension_map = {
            "html": "html",
            "xml": "xml",
            "json": "json",
            "markdown": "md",
            "yaml": "yaml",
            "csv": "csv",
            "plaintext": "txt",
            "text": "txt",
            "txt": "txt",
        }

        # Get the appropriate extension, defaulting to 'txt' if not found
        extension = extension_map.get(input_type.lower(), "txt")

        if filename is None:
            filename = f"reconstructed_content_{input_type}"

        # Append the extension only when the name does not already carry it
        suffix = f".{extension}"
        full_filename = filename if filename.endswith(suffix) else f"{filename}{suffix}"

        try:
            with open(full_filename, 'w', encoding='utf-8') as file:
                file.write(content)
            print(f"Content saved successfully to {full_filename}")
            return full_filename
        except Exception as e:
            print(f"Error saving file: {str(e)}")
            return ""

# Example usage
# if __name__ == "__main__":
#     reconstructor = ContentReconstructor()
    
    # Example inputs (you would replace these with your actual data)
    # original_content = "<html><body><h1>Hello</h1><p>World</p></body></html>"
    # extracted_structure = [
    #     {"type": "heading", "content": "Hello", "metadata": {"tag": "h1"}},
    #     {"type": "paragraph", "content": "World", "metadata": {"tag": "p"}}
    # ]
    # translated_content = ["Bonjour", "le monde"]
    # input_type = "html"

    # reconstructed = reconstructor.reconstruct_output(
    #     original_content, 
    #     extracted_structure, 
    #     translated_content, 
    #     input_type
    # )

    # print("Reconstructed content:")
    # print(reconstructed)

In [None]:
# import json
# import xml.etree.ElementTree as ET
# from bs4 import BeautifulSoup
# import chardet

# def read_file_content(file_path: str) -> str:
#     """
#     Read and return the content of various file types.
    
#     :param file_path: Path to the file
#     :return: Content of the file as a string
#     """
#     try:
#         # Detect the file encoding
#         with open(file_path, 'rb') as file:
#             raw_data = file.read()
#             detected = chardet.detect(raw_data)
#             encoding = detected['encoding']

#         # Read the file content using the detected encoding
#         with open(file_path, 'r', encoding=encoding) as file:
#             content = file.read()

#         # Determine file type based on extension
#         file_extension = file_path.split('.')[-1].lower()

#         if file_extension == 'json':
#             # Parse JSON and format it
#             parsed = json.loads(content)
#             return json.dumps(parsed, indent=2)
        
#         elif file_extension == 'xml':
#             # Parse XML and format it
#             root = ET.fromstring(content)
#             return ET.tostring(root, encoding='unicode', method='xml')
        
#         elif file_extension == 'html':
#             # Parse HTML and format it
#             soup = BeautifulSoup(content, 'html.parser')
#             return soup.prettify()
        
#         else:
#             # For other file types (including .txt), return content as is
#             return content

#     except json.JSONDecodeError:
#         return "Error: Invalid JSON file"
#     except ET.ParseError:
#         return "Error: Invalid XML file"
#     except Exception as e:
#         return f"Error reading file: {str(e)}"

# # Example usage
# file_types = ['html', 'xml', 'json', 'txt']
# for file_type in file_types:
#     file_path = f"example.{file_type}"
#     content = read_file_content(file_path)
#     print(f"\nContent of {file_path}:")
#     print(content[:200] + "..." if len(content) > 200 else content)  # Print first 200 characters

In [None]:
# from fastapi import FastAPI, UploadFile, File, HTTPException
# from fastapi.responses import JSONResponse
# import json
# import xml.etree.ElementTree as ET
# from bs4 import BeautifulSoup
# import io

# app = FastAPI()

# async def read_file_content(file: UploadFile) -> str:
#     """
#     Read and return the content of various file types uploaded via FastAPI.
    
#     :param file: UploadFile object from FastAPI
#     :return: Content of the file as a string
#     """
#     try:
#         content = await file.read()
#         content_str = content.decode()

#         # Determine file type based on filename
#         file_extension = file.filename.split('.')[-1].lower()

#         if file_extension == 'json':
#             # Parse JSON and format it
#             parsed = json.loads(content_str)
#             return json.dumps(parsed, indent=2)
        
#         elif file_extension == 'xml':
#             # Parse XML and format it
#             root = ET.fromstring(content_str)
#             return ET.tostring(root, encoding='unicode', method='xml')
        
#         elif file_extension == 'html':
#             # Parse HTML and format it
#             soup = BeautifulSoup(content_str, 'html.parser')
#             return soup.prettify()
        
#         else:
#             # For other file types (including .txt), return content as is
#             return content_str

#     except json.JSONDecodeError:
#         raise HTTPException(status_code=400, detail="Invalid JSON file")
#     except ET.ParseError:
#         raise HTTPException(status_code=400, detail="Invalid XML file")
#     except Exception as e:
#         raise HTTPException(status_code=500, detail=f"Error reading file: {str(e)}")

# @app.post("/upload/")
# async def upload_file(file: UploadFile = File(...)):
#     content = await read_file_content(file)
#     return JSONResponse(content={"filename": file.filename, "content": content})
