In [1]:
import json

# IMPORT ORIGINAL JSON DATASET
# Decode explicitly as UTF-8 so the load does not depend on the platform's
# default locale encoding (which is not UTF-8 on every OS).
with open('mercari-set-final.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [2]:
# DEFINE CLOTHING FUNCTIONS
def get_category_names(items):
    """
    Collect the 'category_name' value of every dict in `items`, in input order.

    A dict without a 'category_name' key contributes None, so the result has
    one entry per input item.
    """
    names = []
    for entry in items:
        names.append(entry.get("category_name"))
    return names

def get_unique_category_names(items):
    """
    Given a list of dicts, return a list of all unique 'category_name' values.

    The result keeps first-seen order, so running this twice over the same
    input yields the same list. (Going through a set made the order of string
    categories differ from one interpreter session to the next.)

    Items without a 'category_name' key contribute a single None entry,
    because get_category_names reports None for them.
    """
    # dict keys are unique and remember insertion order: an ordered de-duplication.
    return list(dict.fromkeys(get_category_names(items)))

def get_clothing_categories(category_names):
    """
    Filters the category names to only include clothing-related items.

    Args:
        category_names: Iterable of category names. Entries that are not
            strings (for example None, which get_category_names yields for
            items without a 'category_name') are skipped rather than raising
            AttributeError on `.lower()`.

    Returns:
        A list with the original (not lower-cased) names, in input order,
        that contain at least one clothing keyword.

    Note:
        Matching is a case-insensitive *substring* test, so short keywords
        also hit unrelated words ("top" matches "Laptops", "hat" matches
        "Chat"), and 'accessory' does not match the plural "Accessories".
        NOTE(review): inspect the selected categories before relying on them.
    """
    # Keywords related to clothing
    clothing_keywords = (
        "shirt", "pants", "dress", "jacket", "jeans", "sweater", "hoodie",
        "shorts", "skirt", "blouse", "coat", "suit", "t-shirt", "top",
        "clothing", "apparel", "accessory", "footwear", "activewear",
        "outerwear", "underwear", "lingerie", "swimwear", "sneakers", "boots",
        "sandals", "flip-flops", "slippers", "pajamas", "loungewear", "robe",
        "jumpsuit", "romper", "overalls", "leggings", "tights", "stockings",
        "socks", "scarf", "hat", "beanie", "cap", "gloves",
    )

    clothing_categories = []
    for category in category_names:
        if not isinstance(category, str):
            continue  # missing / null category: nothing to match against
        lowered = category.lower()  # lower-case once, not once per keyword
        if any(keyword in lowered for keyword in clothing_keywords):
            clothing_categories.append(category)

    return clothing_categories

In [3]:
# Distinct category names found in the loaded dataset
category_names = get_unique_category_names(data)
print(f"Total number of categories: {len(category_names)}")

# Subset of those names that match a clothing keyword
clothing_categories = get_clothing_categories(category_names)
print(f"Total number of clothing categories: {len(clothing_categories)}")

Total number of categories: 374
Total number of clothing categories: 126


In [4]:
def _gender_from_category(category_lower):
    """Return 'f' or 'm' when the lower-cased category names a gender, else None."""
    # Female markers are tested first because "women" itself contains "men".
    if "women" in category_lower or "girl" in category_lower:
        return "f"
    if "men" in category_lower or "boy" in category_lower:
        return "m"
    return None


def filter_and_reformat(data, clothing_categories):
    """
    Filters items from the JSON data that belong to the clothing categories,
    removes items with descriptions of fewer than 10 words, and reformats
    the JSON attributes to the specified structure.

    Args:
        data: Iterable of item dicts. The keys read are 'category_name',
            'item_description', 'ID', 'name', 'brand_name', 'price',
            'prodLink', 'prodImgLink' and, as a fallback for the gender,
            'gender_from_category'. A missing or null category/description
            is treated as empty, so such items are dropped instead of
            raising.
        clothing_categories: Iterable of category names to keep. An item is
            kept only if its whole 'category_name' equals one of them,
            ignoring case. Non-string entries are ignored.

    Returns:
        A new list of dicts with the keys 'ID', 'name', 'description',
        'category', 'brand', 'gender', 'price', 'prodLink', 'prodImgLink'.
        The input items are left unmodified.
    """
    # Build the lower-cased lookup once instead of once per item.
    allowed_categories = {
        category.lower()
        for category in clothing_categories
        if isinstance(category, str)
    }

    reformatted_items = []
    for item in data:
        # `or ""` also covers a key that is present but null.
        category = item.get("category_name") or ""
        category_lower = category.lower()
        if category_lower not in allowed_categories:
            continue

        # Keep only descriptions with at least 10 whitespace-separated words
        description = item.get("item_description") or ""
        if len(description.split()) < 10:
            continue

        # Infer the gender from the category; if the category says nothing,
        # use a value already stored on the item, and finally "u".
        gender = _gender_from_category(category_lower)
        if gender is None:
            gender = item.get("gender_from_category", "u")

        reformatted_items.append({
            "ID": item.get("ID"),
            "name": item.get("name"),
            "description": description,
            "category": category,
            "brand": item.get("brand_name", ""),
            "gender": gender,
            "price": item.get("price"),
            "prodLink": item.get("prodLink", ""),
            "prodImgLink": item.get("prodImgLink", ""),
        })
    return reformatted_items

# Run the clothing filter over the loaded dataset and report how many items remain
new_json_list = filter_and_reformat(data, clothing_categories)
print(f"Total number of clothing items: {len(new_json_list)}")

Total number of clothing items: 530


In [5]:
# Persist the reformatted items as JSON indented by four spaces
with open('reformatted-mercari-final.json', "w") as file:
    file.write(json.dumps(new_json_list, indent=4))

In [7]:
# UPDATE PRODUCT IDS
def update_product_ids(data):
    """
    Renumber the 'ID' of every item so the IDs run 0, 1, 2, ... in the order
    the items appear.

    The items are modified in place and the same `data` object is returned.
    """
    next_id = 0
    for record in data:
        record["ID"] = next_id  # overwrite (or create) the ID with the item's position
        next_id += 1
    return data

# File that is read, renumbered and then overwritten
input_file_path = "reformatted-mercari-final.json"

# Read the whole file and parse it as JSON
with open(input_file_path, "r") as file:
    data = json.loads(file.read())

# Assign sequential IDs starting at 0
updated_data = update_product_ids(data)

# Write the renumbered items back over the same file
with open(input_file_path, "w") as file:
    file.write(json.dumps(updated_data, indent=4))


In [None]:
# Function to append processed data to the existing JSON file
def process_and_append_to_file(existing_file_path, new_file_path, clothing_categories):
    """
    Processes a new JSON file, applies filtering and reformatting, and appends
    the results to an existing JSON file without overwriting the previous data.

    All files are read and written as UTF-8 so the result does not depend on
    the platform's default locale encoding.

    Args:
        existing_file_path: Path of the JSON file holding the accumulated
            list. If it does not exist it is treated as an empty list and
            created on write.
        new_file_path: Path of the JSON file whose items are filtered and
            appended.
        clothing_categories: Category names forwarded to filter_and_reformat.

    Raises:
        FileNotFoundError: If `new_file_path` does not exist.
        json.JSONDecodeError: If either file does not contain valid JSON.

    Note:
        Every call appends: running it twice with the same `new_file_path`
        stores the processed items twice. The 'ID' values of the appended
        items are whatever filter_and_reformat returns; they are not
        renumbered here.
    """
    # Step 1: Load the existing data; a missing file just means "nothing yet"
    try:
        with open(existing_file_path, "r", encoding="utf-8") as file:
            existing_data = json.load(file)
    except FileNotFoundError:
        existing_data = []

    # Step 2: Load the new data from the new file
    with open(new_file_path, "r", encoding="utf-8") as file:
        new_data = json.load(file)

    # Step 3: Process the new data using the filter_and_reformat function
    processed_new_data = filter_and_reformat(new_data, clothing_categories)

    # Step 4: Append the processed new data to the existing data
    existing_data.extend(processed_new_data)

    # Step 5: Save the combined data back to the existing file
    with open(existing_file_path, "w", encoding="utf-8") as file:
        json.dump(existing_data, file, indent=4)

    print(f"Appended {len(processed_new_data)} new items to {existing_file_path}.")

# Example usage: filter the items in new_file_path and append them to existing_file_path
existing_file_path = "new-mercari-set1.json"
new_file_path = "mercari-set2.json"
process_and_append_to_file(
    existing_file_path=existing_file_path,
    new_file_path=new_file_path,
    clothing_categories=clothing_categories,
)

Appended 328 new items to new-mercari-set1.json.
