In [108]:
import json
from collections import defaultdict
from typing import Dict, TypedDict, List
import boto3
import mimetypes
from io import BytesIO
import requests
import uuid

In [109]:
# Bucket that receives the equipment JSON documents and images uploaded below.
S3_BUCKET = "equipment-model-data"

# Session bound to a named local AWS profile; the same region is given to both
# the session and the client created from it.
boto3_session = boto3.Session(region_name='eu-west-1', profile_name="dan")
s3_client = boto3_session.client('s3', region_name='eu-west-1')

In [110]:
def print_s3_bucket_files(limit=None):
    """Print the key (path) of the objects stored in ``S3_BUCKET``.

    The listing goes through the ``list_objects_v2`` paginator, so every
    result page is visited rather than only the first one.

    Args:
        limit: Optional maximum number of keys to print. ``None`` (the
            default) prints every key; pass a small number for a quick peek.
    """
    printed = 0
    paginator = s3_client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=S3_BUCKET):
        # `.get` with a default tolerates a page that has no 'Contents' entry.
        for obj in page.get('Contents', []):
            if limit is not None and printed >= limit:
                return
            print(obj['Key'])
            printed += 1

In [111]:
class EquipmentData(TypedDict):
    """Shape of one equipment record as loaded from the cleaned scrape JSON."""
    name: str
    # URLs of the item's media; the image-upload cell visits each one.
    image_links: List[str]
    # NOTE(review): presumably the manufacturer part number — confirm.
    mpn: str
    description: str
    # Only the first entry is used when building S3 keys.
    brands: List[str]
    categories: List[str]
    # NOTE(review): presumably stock-keeping unit identifiers — confirm.
    skus: List[str]
    # Treated as optional by the upload code, which substitutes "unknown"
    # when this value is falsy.
    grouping_category: str

In [112]:
# Load the cleaned scrape results: a JSON object mapping each unique equipment
# name to its EquipmentData record. The encoding is pinned to UTF-8 (the
# encoding JSON interchange uses) because open() otherwise falls back to the
# platform locale, which can mis-decode non-ASCII text on some systems.
with open("../files/cleaned_scraped_equipment.json", "r", encoding="utf-8") as f:
    equipment_data: Dict[str, EquipmentData] = json.load(f)

In [113]:
def _new_inner_mapping():
    """Build the second-level mapping, whose missing keys start as empty dicts."""
    return defaultdict(dict)


# Two-level auto-creating mapping: grouped_equipment[a][b] is an empty dict the
# first time it is accessed.
grouped_equipment = defaultdict(_new_inner_mapping)

In [114]:
def upload_base_equipment_data(equipment=None, client=None, bucket=None):
    """Upload each equipment record to S3 as its own ``data.json`` object.

    A record is stored under
    ``equipment/<group>/<brand>/<unique_name>/data.json`` where

    * ``<group>`` is the record's ``grouping_category`` with spaces replaced
      by hyphens (``"unknown"`` when the value is falsy). Its letter case is
      kept as-is, unlike the brand.
    * ``<brand>`` is the first entry of ``brands``, lower-cased with spaces
      replaced by hyphens (``"unknown"`` when the list is empty or the entry
      is blank, instead of raising ``IndexError`` and aborting the upload).

    Args:
        equipment: Mapping of unique equipment name to its record. Defaults
            to the module-level ``equipment_data``.
        client: Object exposing ``put_object(Bucket=, Key=, Body=,
            ContentType=)``. Defaults to the module-level ``s3_client``.
        bucket: Name of the target bucket. Defaults to ``S3_BUCKET``.
    """
    # Resolve defaults at call time so the notebook globals are only needed
    # when the caller does not supply its own values.
    if equipment is None:
        equipment = equipment_data
    if client is None:
        client = s3_client
    if bucket is None:
        bucket = S3_BUCKET

    for unique_name, e_dict in equipment.items():
        group = (e_dict['grouping_category'] or "unknown").replace(" ", "-")

        # Tolerate a missing/empty brand list or a blank first brand.
        brands = e_dict.get('brands') or [""]
        brand = (brands[0] or "unknown").lower().replace(" ", "-")

        path = f"equipment/{group}/{brand}/{unique_name}/data.json"
        client.put_object(
            Bucket=bucket,
            Key=path,
            Body=json.dumps(e_dict),
            ContentType='application/json'
        )

In [115]:
# try:
#     with open("../files/successful_image_downloads.json", "r") as f:
#         successful_downloads = set(json.load(f))
# except Exception as exec:
#     print(f"'successful_image_downloads' open failed: {exec}")
#     successful_downloads = set()

# try:
#     with open("../files/failed_image_downloads.json", "r") as f:
#         failed_downloads = set(json.load(f))
# except Exception as exec:
#     print(f"'failed_image_downloads' open failed: {exec}")
#     failed_downloads = set()

# for index, (unique_name, e_dict) in enumerate(equipment_data.items()):
#     group = (e_dict['grouping_category'] or "unknown").replace(" ", "-")
#     brand = e_dict['brands'][0].lower().replace(" ", "-")

#     for image_link in e_dict['image_links']:
#         if image_link in successful_downloads:
#             continue

#         image_uuid = str(uuid.uuid4())
#         try:
#             response = requests.get(image_link)
#             if response.status_code == 200:
#                 image_bytes = BytesIO(response.content)
#             else:
#                 raise ValueError(f"Failed to download image. Status code: {response.status_code}")

#             content_type, _ = mimetypes.guess_type(image_link)
#             if content_type is None:
#                 raise ValueError(f"Unknown file extension for URL: {image_link}")

#             file_extension = content_type.split("/")[1]
#             path = (
#                 f"equipment/{group}/{brand}/{unique_name}/{unique_name}-{image_uuid}.{file_extension}"
#             )

#             # Upload to S3
#             s3_client.upload_fileobj(
#                 image_bytes, S3_BUCKET, path, ExtraArgs={'ContentType': content_type}
#             )

#             successful_downloads.add(image_link)
#         except Exception as exec:
#             print(f"Failure: {str(exec)}")
#             failed_downloads.add(image_link)

#     print(f"Equipment {index+1}/{len(equipment_data)} complete.")

# with open("../files/successful_image_downloads.json", "w") as f:
#     json.dump(list(successful_downloads), f)
# with open("../files/failed_image_downloads.json", "w") as f:
#     json.dump(list(failed_downloads), f)

# print(f"Total successful downloads: {len(successful_downloads)}")
# print(f"Total failed downloads: {len(failed_downloads)}")

In [None]:
# Reload the set of links stored in the failed-downloads state file.
with open("../files/failed_image_downloads.json", "r") as f:
    failed_links_raw = f.read()
    failed_downloads = set(json.loads(failed_links_raw))

In [122]:
import json
import uuid
import mimetypes
from io import BytesIO
from playwright.async_api import async_playwright

def _load_link_set(json_path, label):
    """Read a JSON list of links from ``json_path`` and return it as a set.

    Any failure (missing file, unreadable JSON, ...) is reported on stdout
    using ``label`` and an empty set is returned instead.
    """
    try:
        with open(json_path, "r") as fh:
            return set(json.load(fh))
    except Exception as err:
        print(f"'{label}' open failed: {err}")
        return set()


# State carried over from earlier runs.
successful_downloads = _load_link_set(
    "../files/successful_image_downloads.json", "successful_image_downloads"
)
failed_downloads = _load_link_set(
    "../files/failed_image_downloads.json", "failed_image_downloads"
)

# Failures recorded during the current run only.
new_failed_downloads = set()

async def process_images(equipment_data):
    """Fetch Technogym images through a real browser and upload them to S3.

    For every record in ``equipment_data`` each entry of ``image_links`` is
    handled as follows:

    * Links containing ``assets.roguefitness.com`` or ``.glb`` are added to
      the module-level ``failed_downloads`` set and skipped.
    * Links that do not contain ``www.technogym.com``, or that are already in
      ``successful_downloads``, are skipped silently.
    * Every other link is opened in a (non-headless) Chromium page and the
      response body is uploaded to ``S3_BUCKET`` under
      ``equipment/<group>/<brand>/<unique_name>/<unique_name>-<uuid>.<ext>``.

    Outcomes are recorded in the module-level sets ``successful_downloads``
    and ``new_failed_downloads``; nothing is returned.

    Args:
        equipment_data: Mapping of unique equipment name to a record holding
            at least ``grouping_category``, ``brands`` and ``image_links``.

    Note:
        The first failed S3 upload ends the whole run (see the ``return``
        in the upload error handler).
    """
    total = len(equipment_data)

    async with async_playwright() as playwright:
        async with await playwright.chromium.launch(headless=False) as browser:
            async with await browser.new_context() as context:
                async with await context.new_page() as page:

                    for index, (unique_name, e_dict) in enumerate(equipment_data.items()):
                        group = (e_dict['grouping_category'] or "unknown").replace(" ", "-")
                        # An empty brand list (or blank first brand) must not
                        # abort the run with an IndexError.
                        brands = e_dict.get('brands') or [""]
                        brand = (brands[0] or "unknown").lower().replace(" ", "-")

                        for image_link in e_dict['image_links']:
                            # Rogue assets and .glb models are never fetched
                            # here; they are only recorded as failed.
                            if (
                                "assets.roguefitness.com" in image_link
                                or ".glb" in image_link
                            ):
                                failed_downloads.add(image_link)
                                continue

                            # Only Technogym links that have not already been
                            # uploaded are processed by this pass.
                            if (
                                "www.technogym.com" not in image_link
                                or image_link in successful_downloads
                            ):
                                continue

                            # Navigate straight to the image URL. Navigation
                            # or body retrieval errors fail this link only
                            # instead of ending the run.
                            try:
                                response = await page.goto(image_link)
                                if response is None:
                                    print(f"No response for image URL: {image_link}")
                                    new_failed_downloads.add(image_link)
                                    continue
                                if not response.ok:
                                    # Do not store an HTTP error page as if it
                                    # were the image.
                                    print(
                                        f"Failed to download image. "
                                        f"Status code: {response.status} ({image_link})"
                                    )
                                    new_failed_downloads.add(image_link)
                                    continue
                                image_bytes = await response.body()
                            except Exception as exc:
                                print(f"Download failed for {image_link}: {exc}")
                                new_failed_downloads.add(image_link)
                                continue

                            # MIME type (and from it the file extension) is
                            # guessed from the URL itself.
                            content_type, _ = mimetypes.guess_type(image_link)
                            if content_type is None:
                                print(f"Unknown file extension for URL: {image_link}")
                                new_failed_downloads.add(image_link)
                                continue

                            file_extension = content_type.split("/")[1]
                            image_uuid = str(uuid.uuid4())
                            path = f"equipment/{group}/{brand}/{unique_name}/{unique_name}-{image_uuid}.{file_extension}"

                            # Upload from memory; nothing is written to disk.
                            try:
                                s3_client.upload_fileobj(
                                    BytesIO(image_bytes),
                                    S3_BUCKET,
                                    path,
                                    ExtraArgs={'ContentType': content_type}
                                )
                            except Exception as exc:
                                print(f"S3 upload failed: {exc}")
                                new_failed_downloads.add(image_link)
                                # NOTE(review): this stops the entire run on the
                                # first upload failure — confirm that is
                                # intended rather than `continue`.
                                return

                            print(f"Image uploaded to S3: {path}")
                            successful_downloads.add(image_link)

                        print(f"Equipment {index + 1}/{total} complete.")

# Run the async function``
await process_images(equipment_data)

# Save only newly failed downloads for the next retry
with open("../files/successful_image_downloads.json", "w") as f:
    json.dump(list(successful_downloads), f)

with open("../files/failed_image_downloads.json", "w") as f:
    json.dump(list(new_failed_downloads), f)

print(f"Total successful downloads: {len(successful_downloads)}")
print(f"Total failed downloads: {len(new_failed_downloads)}")


Equipment 1/3316 complete.
Equipment 2/3316 complete.
Equipment 3/3316 complete.
Equipment 4/3316 complete.
Equipment 5/3316 complete.
Equipment 6/3316 complete.
Equipment 7/3316 complete.
Equipment 8/3316 complete.
Equipment 9/3316 complete.
Equipment 10/3316 complete.
Equipment 11/3316 complete.
Equipment 12/3316 complete.
Equipment 13/3316 complete.
Equipment 14/3316 complete.
Equipment 15/3316 complete.
Equipment 16/3316 complete.
Equipment 17/3316 complete.
Equipment 18/3316 complete.
Equipment 19/3316 complete.
Equipment 20/3316 complete.
Equipment 21/3316 complete.
Equipment 22/3316 complete.
Equipment 23/3316 complete.
Equipment 24/3316 complete.
Equipment 25/3316 complete.
Equipment 26/3316 complete.
Equipment 27/3316 complete.
Equipment 28/3316 complete.
Equipment 29/3316 complete.
Equipment 30/3316 complete.
Equipment 31/3316 complete.
Equipment 32/3316 complete.
Equipment 33/3316 complete.
Equipment 34/3316 complete.
Equipment 35/3316 complete.
Equipment 36/3316 complete.
E

In [133]:
# with open("../files/successful_image_downloads.json", "w") as f:
#     json.dump(list(successful_downloads), f)
# Rewrite the failed-downloads file, leaving out every link containing "rogue".
with open("../files/failed_image_downloads.json", "w") as f:
    downloads = list(filter(lambda link: "rogue" not in link, failed_downloads))
    f.write(json.dumps(downloads, indent=3))