In [35]:
import json
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [40]:
def translate_data(data: list[str], lang, timeout: float = 30):
    """Translate a batch of strings to English via Google's ``translateHtml`` endpoint.

    The function is best-effort: on any failure (network error, timeout,
    non-200 status, unparsable response) it prints a message and returns
    ``data`` unchanged, so callers always get something to iterate over.

    Args:
        data: Strings to translate; all are sent in a single request.
        lang: Language code placed next to ``"en"`` in the request body
            (presumably the source language -- confirm against the API).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The first element of the decoded JSON response on success,
        otherwise ``data`` itself.
    """
    # Nothing to translate: skip the network round-trip entirely.
    if not data:
        return data

    headers = {
        "accept": "*/*",
        "accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
        "content-type": "application/json+protobuf",
        "priority": "u=1, i",
        "sec-ch-ua": "\"Not)A;Brand\";v=\"99\", \"Google Chrome\";v=\"127\", \"Chromium\";v=\"127\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\"",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "cross-site",
        "x-client-data": "CIu2yQEIprbJAQipncoBCKeXywEIk6HLAQia/swBCOmYzQEIhaDNAQ==",
        # NOTE(review): API key is hardcoded in the notebook; consider
        # loading it from an environment variable before sharing this file.
        "x-goog-api-key": "AIzaSyATBXajvzQLTDHEQbcpq0Ihe0vWDHmO520",
    }

    url = "https://translate-pa.googleapis.com/v1/translateHtml"
    body = [[data, lang, "en"], "te_lib"]
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.post(url, headers=headers, json=body, timeout=timeout)
    except requests.RequestException as ex:
        print(f'Error at translation, request failed : {ex}')
        return data

    if response.status_code == 200:
        try:
            return json.loads(response.text)[0]
        except (ValueError, LookupError, TypeError) as ex:
            # ValueError covers invalid JSON; LookupError/TypeError cover a
            # decoded payload that cannot be indexed with [0].
            print(f'Error at google translate : {ex}\n{response.text}')
    else:
        print(f'Error at translation, Status code : {response.status_code}')
    return data


def get_spare_parts(soup: "BeautifulSoup") -> list[dict[str, str]]:
    """Extract the spare-part rows from a parsed catalog page.

    Each matching ``div.table-rows`` element is split into its non-blank
    text lines; the first line is taken as the item number and the second
    as the description. Rows with fewer than two non-blank lines are
    skipped instead of raising ``IndexError``, so one malformed row no
    longer aborts the whole page.

    Args:
        soup: Parsed HTML of the catalog page.

    Returns:
        One dict per usable row with keys ``'Item Number'`` and
        ``'Desctiption'`` (the misspelt key is kept because callers and the
        saved JSON rely on it).
    """
    rows = soup.select(
        'div[class="imagemap-content accordion"] input.search-for-spareparts+div.row div.table-rows')
    spare_part_obj = []
    for row in rows:
        # Strip each line so whitespace-only lines do not count as fields.
        fields = [line.strip() for line in row.text.split('\n') if line.strip()]
        if len(fields) < 2:
            continue
        item_number, description = fields[:2]
        spare_part_obj.append({
            'Item Number': item_number,
            'Desctiption': description
        })
    return spare_part_obj


def _sanitize_filename(filename: str) -> str:
    invalid_chars = r'[<>:"/\\|?*\']'
    sanitized_filename = re.sub(invalid_chars, "_", filename)
    sanitized_filename = sanitized_filename.strip()
    sanitized_filename = sanitized_filename[:255]
    return sanitized_filename


def scrape_data(data, timeout: float = 30):
    """Fetch one catalog page and return its parts list and image details.

    Args:
        data: Mapping with the keys ``'URL'``, ``'Catalog'``, ``'SGL'`` and
            ``'lang'``. A missing key raises ``KeyError`` before any
            request is made.
        timeout: Seconds to wait for the page request before giving up.

    Returns:
        A dict with ``'SGL'``, ``'Catalog'``, ``'Parts'``, ``'Img Url'`` and
        ``'Img Filename'`` on success, or ``None`` when the page could not
        be fetched or parsed (the reason is printed).
    """
    url = data['URL']
    catalog = data['Catalog']
    sgl_code = data['SGL']
    language = data['lang']
    try:
        # Without a timeout a stalled server would block the whole run.
        response = requests.get(url, timeout=timeout)
        print(response.status_code)
        if response.status_code != 200:
            print('Response Error')
            return None

        # The endpoint returns JSON whose 'html' field holds the page markup.
        html = response.json()['html']
        soup = BeautifulSoup(html, 'html.parser')

        img_tag = soup.select_one('div.explodedViewsContainer img')
        if img_tag is None:
            # Report the real cause instead of an AttributeError on None.
            print(f'Exception at {sgl_code}: exploded-view image not found')
            return None
        img_url = urljoin(url, img_tag.get('src'))

        parts = get_spare_parts(soup)
        descriptions = [part['Desctiption'] for part in parts]
        translated_parts = translate_data(descriptions, language)
        if len(translated_parts) != len(parts):
            # zip() would silently truncate and leave parts without a
            # 'Description'; fall back to the untranslated text instead.
            print(f'Translation count mismatch at {sgl_code}, keeping original descriptions')
            translated_parts = descriptions
        for part, translated_description in zip(parts, translated_parts):
            part['Description'] = translated_description

        img_filename = _sanitize_filename(f'{sgl_code}-{catalog}.jpg')
        return {
            'SGL': sgl_code,
            'Catalog': catalog,
            'Parts': parts,
            'Img Url': img_url,
            'Img Filename': img_filename
        }
    except Exception as e:
        # Best-effort scraping: keep going, but show what actually failed.
        print(f'Exception at {sgl_code}: {e!r}')
    return None

In [41]:
# Load the catalog entries to scrape. The encoding is pinned to UTF-8 so the
# read does not depend on the platform's default locale encoding.
with open("All Data.json", 'r', encoding="utf-8") as json_file:
    all_data = json.load(json_file)

In [46]:
# Scrape every entry of all_data, keeping only the results that succeeded.
objs = []
for index, d in enumerate(all_data):
    # Double quotes inside the replacement field keep this f-string valid on
    # Python versions before 3.12, which reject reusing the outer quote type.
    print(f'On {index} out of {len(all_data)} : {d["SGL"]}')
    val = scrape_data(d)
    if val:
        objs.append(val)

On 0 out of 36 : SGL0000075639
200
On 1 out of 36 : SGL0000075640
200
Exception at SGL0000075640
On 2 out of 36 : SGL0000075703
200
On 3 out of 36 : SGL0000002013
200
On 4 out of 36 : SGL0000002014
200
On 5 out of 36 : SGL0000002015
200
On 6 out of 36 : SGL0000002016
200
On 7 out of 36 : SGL0000002017
200
On 8 out of 36 : SGL0000002018
200
On 9 out of 36 : SGL0000002019
200
On 10 out of 36 : SGL0000002020
Exception at SGL0000002020
On 11 out of 36 : SGL0000002021
Exception at SGL0000002021
On 12 out of 36 : SGL0000002022
200
On 13 out of 36 : SGL0000002023
200
On 14 out of 36 : SGL0000002024
200
On 15 out of 36 : SGL0000002025
200
On 16 out of 36 : SGL0000002026
200
On 17 out of 36 : SGL0000002027
200
On 18 out of 36 : SGL0000002029
200
On 19 out of 36 : SGL0000002030
200
On 20 out of 36 : SGL0000002031
200
On 21 out of 36 : SGL0000002032
200
On 22 out of 36 : SGL0000002040
200
Exception at SGL0000002040
On 23 out of 36 : SGL0000002043
200
On 24 out of 36 : SGL0000002044
200
On 25 out 

In [49]:
# Persist the scraped records as indented JSON.
with open("Scraped Data.json", 'w') as out_file:
    json.dump(objs, out_file, indent=4)

In [48]:
# Display the records collected by the scraping loop for inspection.
objs

[{'SGL': 'SGL0000075639',
  'Catalog': 'Gasoline lawn tractor T 18-111.4 HDS-A V2',
  'Parts': [{'Item Number': '51400402',
    'Desctiption': 'Mittelkonsole Vorne R3000',
    'Description': 'Center console front R3000'},
   {'Item Number': '457760',
    'Desctiption': '9039-490-0994-a Gew.forms. Din7500-m6x16',
    'Description': '9039-490-0994-a Weight Forms. Din7500-m6x16'},
   {'Item Number': '49300601',
    'Desctiption': 'Trittplatte Rechts R9005',
    'Description': 'Tread plate right R9005'},
   {'Item Number': '49300701',
    'Desctiption': 'Trittplatte Links R9005',
    'Description': 'Tread plate left R9005'},
   {'Item Number': '49308001',
    'Desctiption': 'Konsole Motorhaube E R9005',
    'Description': 'Console bonnet E R9005'},
   {'Item Number': '746005',
    'Desctiption': 'Shr 6x 40 7985- 4.8-tx30 A3b',
    'Description': 'Shr6x40 7985-4.8-tx30 A3b'},
   {'Item Number': '705657',
    'Desctiption': 'Mu 6kt  6  En1663-  8  A2k',
    'Description': 'Mu 6kt 6 En1663- 8