In [None]:
import os
import re
import webbrowser
from io import BytesIO
from pathlib import Path
from urllib.parse import urljoin, urlparse

import PIL
import pytesseract
import requests
from bs4 import BeautifulSoup
from bs4.element import PreformattedString
from deep_translator import GoogleTranslator
from PIL import Image
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Optional override for the Tesseract OCR executable location (currently disabled)
# pytesseract.pytesseract.tesseract_cmd = r'C:\Users\PATH\Tesseract-OCR\tesseract.exe'

# Page that gets fetched, translated and saved locally
web_url = "https://www.lge.co.kr/"

# Korean -> English translator instance used by translate_text()
translator = GoogleTranslator(target="en", source="ko")

# Extract text from an image URL using Tesseract OCR
def extract_text_from_image_url(image_url, lang='eng'):
    """Download an image and return the text Tesseract recognises in it.

    Args:
        image_url: URL of the image to fetch with ``requests.get``.
        lang: Tesseract language code(s) passed to ``image_to_string``.
            Defaults to ``'eng'``, the value that was previously hard-coded.
            NOTE(review): the module-level translator is configured ko -> en,
            so callers may want a Korean model here -- confirm.

    Returns:
        The OCR output with leading/trailing whitespace stripped, or ``""``
        when the download, the image decoding or the OCR step fails. Failures
        are printed, never raised.
    """
    try:
        # requests applies no timeout by default; bound the wait so a single
        # stalled image cannot hang the whole run.
        response = requests.get(image_url, timeout=30)
        response.raise_for_status()  # Check for request success
        # The context manager closes the image once OCR is done.
        with Image.open(BytesIO(response.content)) as image:
            extracted_text = pytesseract.image_to_string(image, lang=lang)
        return extracted_text.strip()
    except requests.exceptions.RequestException as req_err:
        print("Text extraction from image URL failed:", req_err)
        return ""
    except PIL.UnidentifiedImageError:
        # Has to come before the generic handler: placed after
        # ``except Exception`` this branch could never run.
        print("Text extraction from image URL failed: Unsupported image format")
        return ""
    except Exception as e:
        print("Text extraction from image URL failed:", e)
        return ""

# Translate text using Google Translator
def translate_text(text):
    """Translate ``text`` with the module-level ``translator``.

    Args:
        text: String to translate. ``None``, empty and whitespace-only values
            are treated as "nothing to translate".

    Returns:
        The translated string, or ``""`` when there is nothing to translate or
        the translator raises (the error is printed, never raised).
    """
    # Nothing meaningful to translate: skip the translator call entirely.
    if not text or not str(text).strip():
        return ""
    try:
        # str() hands the translator a plain string even when the caller
        # passes a str subclass such as a BeautifulSoup text node.
        translated_text = translator.translate(str(text))
    except Exception as e:
        print("Translation failed:", e)
        return ""
    # Guard against a None result so callers always receive a string.
    return translated_text or ""

# Modify HTML content with translated text and CSS
def modify_html_with_translated_text_and_css(html_content, translated_texts, css_files):
    """Return ``html_content`` with translated text and extra stylesheet links.

    Args:
        html_content: HTML markup to parse with ``html.parser``.
        translated_texts: Mapping of original text -> translated text. Entries
            whose translation is empty are ignored, so the original text is
            kept instead of being blanked out.
        css_files: Iterable of stylesheet URLs; one ``link rel="stylesheet"``
            element is appended to the document head per entry. ``None`` or an
            empty iterable adds nothing.

    Returns:
        The prettified, modified HTML string, or ``None`` if anything fails
        (the error is printed, never raised).
    """
    try:
        soup = BeautifulSoup(html_content, 'html.parser')

        # Add link to CSS files
        if css_files:
            head = soup.head
            if head is None:
                # Document without a head element: create one rather than
                # failing the whole rewrite on ``None.append``.
                head = soup.new_tag("head")
                container = soup.html if soup.html is not None else soup
                container.insert(0, head)
            for css_file in css_files:
                head.append(soup.new_tag("link", rel="stylesheet", href=css_file))

        # Iterate through img tags and replace alt text with translated text
        for img_tag in soup.find_all('img'):
            alt_text = img_tag.get('alt')
            if alt_text:
                # Keep the existing alt text when no usable translation exists
                img_tag['alt'] = translated_texts.get(alt_text) or alt_text

        # Replace text in all elements with translated text
        for element in soup.find_all(string=True):
            # Comments, doctype, CDATA etc. are also returned by
            # find_all(string=True); replacing them with a plain string would
            # turn them into visible page text, so leave them untouched.
            if isinstance(element, PreformattedString):
                continue
            if element.parent is None or element.parent.name in ('script', 'style'):
                continue
            translated_text = translated_texts.get(element)
            if translated_text:
                element.replace_with(translated_text)

        return soup.prettify()
    except Exception as e:
        print("Modifying HTML content failed:", e)
        return None

# Download and save files from URLs
def download_and_save_files(files):
    """Download each URL in ``files`` and write it to the working directory.

    Every URL is handled independently: a failed download or write is printed
    and the loop continues with the next URL.

    Args:
        files: Iterable of URLs to fetch.

    Returns:
        None. Files are written next to the notebook, named after the last
        path segment of the URL (query string and fragment excluded).
        NOTE(review): two URLs sharing the same last path segment overwrite
        each other.
    """
    for index, url in enumerate(files):
        try:
            # requests applies no timeout by default; bound the wait.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            # Use only the URL path so "app.js?v=3" is stored as "app.js".
            file_name = os.path.basename(urlparse(url).path)
            # Remove invalid characters from the filename using regex
            file_name = re.sub(r'[\/:*?"<>|]', '_', file_name)
            if not file_name:
                # URL path ends with "/": open("") would fail, use a placeholder
                file_name = f"download_{index}"
            with open(file_name, 'wb') as f:
                f.write(response.content)
            print(f"Downloaded and saved: {file_name}")
        except requests.exceptions.RequestException as req_err:
            print("File download failed:", req_err)
        except Exception as e:
            print("File download failed:", e)

# Extract CSS and script files from the website
def extract_css_and_script_files(web_url):
    """Collect the page's non-absolute stylesheet and script URLs.

    Fetches ``web_url``, finds ``link rel="stylesheet"`` and ``script src``
    elements and resolves every reference that does not already start with
    ``http:`` / ``https:`` into an absolute URL. References that are already
    absolute are skipped, as before.

    Args:
        web_url: URL of the page to inspect.

    Returns:
        A ``(css_files, script_files)`` tuple of lists of absolute URLs.
        ``([], [])`` is returned when the request or the parsing fails (the
        error is printed, never raised).
    """

    def _resolve_relative(tags, attribute, base_url):
        # Absolute form of every non-http(s) reference found in ``attribute``.
        resolved = []
        for tag in tags:
            raw_url = tag.get(attribute)
            if raw_url and not raw_url.startswith(('http:', 'https:')):
                # urljoin handles "a.css", "/a.css", "../a.css" and "//host/a.css"
                # correctly; plain string concatenation produced "///a.css" for
                # root-relative paths because web_url already ends with "/".
                resolved.append(urljoin(base_url, raw_url))
        return resolved

    try:
        # requests applies no timeout by default; bound the wait.
        response = requests.get(web_url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Relative references are relative to the URL actually served
        # (after any redirect), so resolve against response.url.
        base_url = response.url or web_url

        css_files = _resolve_relative(soup.find_all('link', rel='stylesheet'), 'href', base_url)
        script_files = _resolve_relative(soup.find_all('script', src=True), 'src', base_url)

        return css_files, script_files

    except requests.exceptions.RequestException as req_err:
        print("Request error while extracting files:", req_err)
        return [], []
    except Exception as e:
        print("An error occurred while extracting files:", e)
        return [], []

# Script entry point: download the page assets, translate the page text and
# image text, then save the rewritten page and open it in a browser.
if __name__ == "__main__":
    try:
        # Extract CSS and script files
        css_files, script_files = extract_css_and_script_files(web_url)

        # Download and save script and CSS files
        download_and_save_files(script_files)
        download_and_save_files(css_files)

        # Start processing the web page
        # requests applies no timeout by default; bound the wait.
        response = requests.get(web_url, timeout=30)
        response.raise_for_status()
        html_content = response.text
        soup = BeautifulSoup(html_content, 'html.parser')
        # Relative references resolve against the URL actually served
        base_url = response.url or web_url

        # Initialize a dictionary to store translated texts for each element
        translated_texts = {}

        for element in soup.find_all(string=True):
            # Comments, doctype and similar special strings are not page text
            if isinstance(element, PreformattedString):
                continue
            if element.parent and element.parent.name not in ['script', 'style']:
                # Skip whitespace-only nodes and strings already translated,
                # so each distinct text is sent to the translator only once.
                if not element.strip() or element in translated_texts:
                    continue
                translated_text = translate_text(element)
                translated_texts[element] = translated_text  # Store the translated text

        # Iterate through img tags and process text
        img_tags = soup.find_all('img')
        for img_tag in img_tags:
            image_url = img_tag.get('src')
            if image_url and not image_url.startswith(('data:', 'http:', 'https:')):
                # urljoin copes with "img.png", "/img.png" and "//host/img.png";
                # string concatenation produced "///img.png" for root-relative
                # paths because web_url already ends with "/".
                image_url = urljoin(base_url, image_url)
                extracted_text = extract_text_from_image_url(image_url)
                if extracted_text:
                    translated_text = translate_text(extracted_text)
                    if translated_text:
                        img_tag['alt'] = translated_text

            # Report the img tag's src exactly as it appears in the page
            image_url = img_tag.get('src')
            if image_url:
                print("Image URL:", image_url)

        # Modify HTML with translated text and CSS
        modified_html = modify_html_with_translated_text_and_css(str(soup), translated_texts, css_files)

        # Save modified HTML and open in browser
        if modified_html:
            modified_file_path = 'modifiedO_page.html'
            with open(modified_file_path, 'w', encoding='utf-8') as f:
                f.write(modified_html)
            # webbrowser expects a URL; passing a bare relative file name is
            # not portable, so hand it an absolute file:// URI.
            webbrowser.open(Path(modified_file_path).resolve().as_uri(), new=2)
            print("Web page modified and opened in browser.")
        else:
            print("Failed to modify the HTML content.")

    except requests.exceptions.RequestException as req_err:
        print("Request error:", req_err)
    except Exception as e:
        print("An error occurred:", e)


In [1]:
!pip install pytesseract



In [5]:
!pip install deep_translator

Collecting deep_translator
  Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: deep_translator
Successfully installed deep_translator-1.11.4


In [7]:
!pip install Selenium

Collecting Selenium
  Downloading selenium-4.13.0-py3-none-any.whl (9.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m58.3 MB/s[0m eta [36m0:00:00[0m
Collecting trio~=0.17 (from Selenium)
  Downloading trio-0.22.2-py3-none-any.whl (400 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.2/400.2 kB[0m [31m42.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trio-websocket~=0.9 (from Selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl (17 kB)
Collecting outcome (from trio~=0.17->Selenium)
  Downloading outcome-1.2.0-py2.py3-none-any.whl (9.7 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->Selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl (24 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->Selenium)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstall