In [10]:
import json
from collections import defaultdict
from typing import Dict, TypedDict, List
import boto3
import mimetypes
from io import BytesIO
import requests
import uuid

In [11]:
# Bucket that every S3 call in this notebook reads from or writes to.
S3_BUCKET = "equipment-model-data"

# One shared S3 client, created from the locally configured "dan" AWS profile.
boto3_session = boto3.Session(profile_name="dan")
s3_client = boto3_session.client("s3")

In [12]:
def print_s3_bucket_files():
    """Print the key of every object listed in ``S3_BUCKET``.

    The listing is read through the ``list_objects_v2`` paginator, one page
    at a time. A page without a ``'Contents'`` entry contributes no keys.
    """
    listing_pages = s3_client.get_paginator('list_objects_v2').paginate(Bucket=S3_BUCKET)

    # Flatten pages -> objects lazily so keys are printed in listing order.
    object_keys = (
        entry['Key']
        for listing in listing_pages
        for entry in listing.get('Contents', [])
    )
    for object_key in object_keys:
        print(object_key)

In [13]:
class EquipmentData(TypedDict):
    """Typed shape of one equipment record loaded from the scraped-equipment JSON.

    The field comments describe only how this notebook uses each field;
    fields without a comment are not read individually here and are simply
    serialized along with the rest of the record.
    """
    name: str
    image_links: List[str]  # each link is fetched with requests.get and the image uploaded to S3
    mpn: str  # NOTE(review): presumably the manufacturer part number - confirm
    description: str
    brands: List[str]  # the first entry becomes the brand segment of the S3 key
    categories: List[str]
    skus: List[str]
    grouping_category: str  # falsy values are replaced by "unknown" when S3 keys are built

In [14]:
# Load the cleaned scrape results, annotated as a mapping of unique equipment
# name -> EquipmentData record.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so the file is
# decoded the same way on every platform instead of using the locale default.
with open("../files/cleaned_scraped_equipment.json", "r", encoding="utf-8") as f:
    equipment_data: Dict[str, EquipmentData] = json.load(f)

In [16]:
def _new_inner_grouping():
    """Create the second-level mapping, whose missing keys start as an empty dict."""
    return defaultdict(dict)


# Two-level auto-creating mapping: grouped_equipment[a][b] is a plain dict that
# is created on first access, so no key has to be initialised by hand.
grouped_equipment = defaultdict(_new_inner_grouping)

In [17]:
def upload_base_equipment_data(equipment=None, client=None, bucket=None):
    """Upload every equipment record to S3 as a ``data.json`` object.

    Each record is serialized with ``json.dumps`` and stored under
    ``equipment/{group}/{brand}/{unique_name}/data.json``, where spaces in the
    group and brand are replaced by ``-`` and the brand is lower-cased.

    Args:
        equipment: Mapping of unique equipment name -> record dict. Defaults
            to the notebook-level ``equipment_data``.
        client: Object exposing ``put_object(Bucket=, Key=, Body=,
            ContentType=)``. Defaults to the notebook-level ``s3_client``.
        bucket: Name of the target bucket. Defaults to ``S3_BUCKET``.

    Returns:
        None.
    """
    records = equipment_data if equipment is None else equipment
    s3 = s3_client if client is None else client
    target_bucket = S3_BUCKET if bucket is None else bucket

    for unique_name, e_dict in records.items():
        group = (e_dict.get('grouping_category') or "unknown").replace(" ", "-")

        # A record with no (or an empty) brand used to raise IndexError and
        # abort the whole upload; fall back to "unknown" like the group does.
        brands = e_dict.get('brands') or []
        first_brand = (brands[0] if brands else None) or "unknown"
        brand = first_brand.lower().replace(" ", "-")

        s3.put_object(
            Bucket=target_bucket,
            Key=f"equipment/{group}/{brand}/{unique_name}/data.json",
            Body=json.dumps(e_dict),
            ContentType='application/json',
        )

In [None]:
# Mirror every equipment image into S3. The links that succeeded or failed are
# remembered in two JSON files so that re-running this cell resumes where the
# previous run stopped instead of uploading everything again.
SUCCESSFUL_DOWNLOADS_PATH = "../files/successful_image_downloads.json"
FAILED_DOWNLOADS_PATH = "../files/failed_image_downloads.json"
# requests applies no timeout unless one is passed, so a single stalled server
# would hang the whole run; a timed-out link is recorded as a failure instead.
IMAGE_REQUEST_TIMEOUT_SECONDS = 30


def _load_link_set(path, label):
    """Return the links stored as a JSON list at ``path`` as a set.

    Any failure to open or parse the file is reported and an empty set is
    returned, so a first run (no progress file yet) starts from scratch.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            return set(json.load(f))
    except Exception as exc:
        print(f"'{label}' open failed: {exc}")
        return set()


successful_downloads = _load_link_set(SUCCESSFUL_DOWNLOADS_PATH, "successful_image_downloads")
failed_downloads = _load_link_set(FAILED_DOWNLOADS_PATH, "failed_image_downloads")

try:
    for index, (unique_name, e_dict) in enumerate(equipment_data.items()):
        group = (e_dict.get('grouping_category') or "unknown").replace(" ", "-")

        # A record without a brand used to raise IndexError here, outside the
        # per-image try, aborting the run; fall back to "unknown" like group.
        brands = e_dict.get('brands') or []
        brand = ((brands[0] if brands else None) or "unknown").lower().replace(" ", "-")

        for image_link in e_dict.get('image_links') or []:
            # Already uploaded by this or a previous run.
            if image_link in successful_downloads:
                continue

            # The object key embeds a fresh UUID, so uploading the same link
            # twice creates a second object rather than overwriting the first.
            image_uuid = str(uuid.uuid4())
            try:
                response = requests.get(image_link, timeout=IMAGE_REQUEST_TIMEOUT_SECONDS)
                if response.status_code != 200:
                    raise ValueError(f"Failed to download image. Status code: {response.status_code}")
                image_bytes = BytesIO(response.content)

                # The content type is guessed from the link itself, not from
                # the response headers.
                content_type = mimetypes.guess_type(image_link)[0]
                if content_type is None:
                    raise ValueError(f"Unknown file extension for URL: {image_link}")

                # The extension is the MIME subtype (e.g. "image/png" -> "png").
                file_extension = content_type.split("/")[1]
                path = (
                    f"equipment/{group}/{brand}/{unique_name}/{unique_name}-{image_uuid}.{file_extension}"
                )

                # Upload to S3
                s3_client.upload_fileobj(
                    image_bytes, S3_BUCKET, path, ExtraArgs={'ContentType': content_type}
                )

                successful_downloads.add(image_link)
                # A link that failed earlier but works now is no longer a failure.
                failed_downloads.discard(image_link)
            except Exception as exc:
                # Best effort: record the failure and move on to the next image.
                print(f"Failure: {str(exc)}")
                failed_downloads.add(image_link)

        print(f"Equipment {index+1}/{len(equipment_data)} complete.")
finally:
    # Persist progress even when the loop is interrupted or crashes, so the
    # next run does not upload the same images again.
    with open(SUCCESSFUL_DOWNLOADS_PATH, "w", encoding="utf-8") as f:
        json.dump(list(successful_downloads), f)
    with open(FAILED_DOWNLOADS_PATH, "w", encoding="utf-8") as f:
        json.dump(list(failed_downloads), f)

print(f"Total successful downloads: {len(successful_downloads)}")
print(f"Total failed downloads: {len(failed_downloads)}")

Total successful downloads: 1
Total failed downloads: 0
