In [49]:
import json

# Load the original scraped product dataset.
# The encoding is given explicitly: without it, open() falls back to the
# platform's locale encoding, which is not UTF-8 everywhere and can fail on
# (or silently corrupt) non-ASCII characters in product names/descriptions.
with open('mens-all-products.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [50]:
# Keyword lists per category. Dict order matters: categories are tested from
# top to bottom and the first category with a matching keyword wins.
# Matching is plain substring matching, so short keywords ("top", "hat",
# "tie", ...) can also hit inside longer words.
_CATEGORY_KEYWORDS = {
    "Tops": [
        "shirt", "tee", "t-shirt", "blouse", "polo", "tank top", "tank", "top",
        "turtleneck", "tunic", "sweater", "sweatshirt", "hoodie", "pullover",
        "cardigan", "vest", "crop top", "camisole", "halter", "button-up",
        "dress", "gown", "maxi dress", "mini dress", "midi dress", "slip dress",
        "bodycon", "shift dress", "wrap dress", "jumpsuit", "romper", "playsuit"
    ],
    "Bottoms": [
        "jeans", "pants", "trousers", "chinos", "joggers", "shorts", "skirt",
        "leggings", "jeggings", "culottes", "flare", "wide-leg", "slim-fit",
        "skinny", "cargo", "khakis"
    ],
    "Outerwear": [
        "jacket", "coat", "blazer", "parka", "raincoat", "trench", "puffer",
        "windbreaker", "bomber", "denim jacket", "leather jacket", "overcoat"
    ],
    "Shoes": [
        "shoes", "sneakers", "boots", "sandals", "heels", "flats", "loafers",
        "pumps", "oxfords", "mules", "espadrilles", "slippers"
    ],
    "Accessories": [
        "hat", "cap", "beanie", "scarf", "gloves", "socks", "belt", "tie",
        "bag", "purse", "wallet", "sunglasses", "jewelry", "watch", "backpack"
    ]
}


def _first_matching_category(text):
    """Return the first category whose keyword list has a hit in ``text``, else None."""
    for category, keywords in _CATEGORY_KEYWORDS.items():
        if any(keyword in text for keyword in keywords):
            return category
    return None


def extract_category_from_product(product):
    """
    Extract the appropriate clothing category from a product based on its name and description.

    Args:
        product: dict for one scraped product. The keys read are
            "extractedData" (with "productName", "description"),
            "structuredData" (with "name", "description") and "category".
            Missing keys and null/empty values are tolerated.

    Returns:
        One of "Tops", "Bottoms", "Outerwear", "Shoes", "Accessories",
        "Swimwear", or the generic fallback "Clothing".
    """
    # `or {}` / `or ""` (instead of .get defaults) so that JSON nulls and
    # empty strings are handled too, and so that an empty extracted value
    # falls back to the structured-data value.
    structured = product.get("structuredData") or {}
    extracted = product.get("extractedData") or {}

    product_name = str(extracted.get("productName") or structured.get("name") or "").lower()
    description = str(extracted.get("description") or structured.get("description") or "").lower()
    original_category = str(product.get("category") or "").lower()

    # Pass 1: the product name alone. It is the strongest signal; scanning the
    # description at the same time lets incidental words decide the result
    # (e.g. "pairs with a shirt" or "topstitching" would turn pants/jackets
    # into "Tops", because "Tops" is tested first).
    if product_name:
        category = _first_matching_category(product_name)
        if category is not None:
            return category

    # Pass 2: name, description and source category together.
    combined_text = f"{product_name} {description} {original_category}"
    category = _first_matching_category(combined_text)
    if category is not None:
        return category

    # Special cases not covered by the keyword lists.
    if "denim" in combined_text:
        return "Bottoms"

    if "suit" in combined_text:
        if "swimsuit" in combined_text:
            return "Swimwear"
        return "Outerwear"  # Default suits to Outerwear

    # Source-category slugs that the keyword lists do not catch.
    # ("women-jean" contains "men-jean", so one test covers both.)
    # Checks for shirt/shoes/jacket/coat/dress slugs are not needed here:
    # the category is part of combined_text, so those words are already
    # caught by the keyword pass above.
    if "men-jean" in original_category:
        return "Bottoms"
    if "accessori" in original_category:
        return "Accessories"

    # Default to generic clothing if we can't determine more specifically
    return "Clothing"



In [52]:
def categorize_zara_products(products):
    """
    Categorize a list of Zara product data and format according to requirements.

    Args:
        products: iterable of product dicts. The keys read are "extractedData"
            ("productName", "description", "price", "allImages"),
            "structuredData" ("brand"), "category" and "url". Missing keys
            and null values are tolerated.

    Returns:
        A list with one dict per input product, with the keys "ID" (position
        in ``products``), "name", "description", "category", "brand",
        "gender" ("m", "f" or "u"), "price", "prodLink" and "prodImgLink".
    """
    results = []

    for i, product in enumerate(products):
        # `or {}` / `or ""` also covers keys that are present but null.
        extracted = product.get("extractedData") or {}
        structured = product.get("structuredData") or {}
        source_category = str(product.get("category") or "").lower()

        # Extract category
        category = extract_category_from_product(product)

        # First image whose URL contains "-e1"; empty string when there is none.
        images = extracted.get("allImages") or []
        prod_img_link = next(
            (img for img in images if isinstance(img, str) and "-e1" in img),
            "",
        )

        # Get price without $ symbol. Handles both "$ 12.90" and "$12.90";
        # non-string prices are passed through unchanged.
        price = extracted.get("price")
        if price is None:
            price = ""
        if isinstance(price, str) and price.startswith("$"):
            price = price.replace("$ ", "").replace("$", "").strip()

        # "women" must be tested before "men": the string "women" contains
        # "men", so testing "men" first labels every women's item as "m".
        if "women" in source_category:
            gender = "f"
        elif "men" in source_category:
            gender = "m"
        else:
            gender = "u"

        # Create the formatted product
        formatted_product = {
            "ID": i,
            "name": extracted.get("productName", ""),
            "description": extracted.get("description", ""),
            "category": category,
            "brand": structured.get("brand", ""),
            "gender": gender,
            "price": price,
            "prodLink": product.get("url", ""),
            "prodImgLink": prod_img_link
        }

        results.append(formatted_product)

    return results



In [53]:
# Run every loaded product through the formatter.
formatted_data = categorize_zara_products(data)

# Persist the formatted records as pretty-printed JSON (two-space indent).
with open('final-mens-all-sellers.json', mode='w') as f:
    json.dump(obj=formatted_data, fp=f, indent=2)