In [1]:
import json
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# Fetch the NLTK data packages this notebook relies on.
import nltk
# Stop-word lists; read below via stopwords.words('english').
nltk.download('stopwords')
# Tokenizer data for word_tokenize. NOTE(review): which of the two punkt
# packages is actually loaded presumably depends on the installed NLTK
# version (newer releases use punkt_tab) — confirm before dropping either.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adriasoria/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/adriasoria/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adriasoria/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
# Relative paths of the raw JSON dataset and of the CSV written at the end.
ruta_entrada = "../../data/fashion_products_dataset.json"
ruta_salida = "../../data/productos_preprocesados.csv"

# Load the whole dataset into memory. The processing loop iterates over it and
# calls .get() on every element, so it is presumably a JSON array of objects.
with open(ruta_entrada, "r", encoding="utf-8") as f:
    productos = json.load(f)

print(f"Cargados {len(productos)} productos correctamente")

# English stop-word set and Porter stemmer shared by preprocess_text.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    """Normalize a free-text field into a space-separated string of stems.

    Steps: lowercase, delete every character that is not a-z or whitespace,
    tokenize, drop English stop words, Porter-stem, and discard stems of
    length 1 or less.

    Args:
        text: The raw text. Any non-string value is treated as missing.

    Returns:
        The processed tokens joined by single spaces; "" for non-string input.
    """
    if not isinstance(text, str):
        return ""

    letters_only = re.sub(r'[^a-z\s]', '', text.lower())

    kept = []
    for token in word_tokenize(letters_only):
        if token in stop_words:
            continue
        stem = stemmer.stem(token)
        # Single-character stems carry no useful signal and are dropped.
        if len(stem) > 1:
            kept.append(stem)
    return " ".join(kept)

def clean_price(price):
    """Convert a raw price value to a float, or None when it is not a plain number.

    Strings have "," separators and surrounding whitespace removed and must then
    consist only of digits with at most one "."; anything else (currency symbols,
    signs, empty text) yields None. Non-string values are handed to float().

    Args:
        price: Raw price, e.g. the string "1,299" or a number.

    Returns:
        The price as a float, or None if it cannot be interpreted.
    """
    if isinstance(price, str):
        price = price.replace(",", "").strip()
        if not price.replace('.', '', 1).isdigit():
            return None
    try:
        return float(price)
    except (TypeError, ValueError, OverflowError):
        # ValueError also covers strings that satisfy isdigit() but that float()
        # rejects (e.g. superscript digits); previously those raised to the caller.
        return None

def clean_discount(d):
    """Extract the numeric part of a discount value.

    Args:
        d: Raw discount: a string containing a number (for example "69% off")
            or a bare int/float.

    Returns:
        For strings, the first number found, as a float. For int/float input,
        the value itself as a float. None when the string holds no number or
        the type is unsupported (including bool).
    """
    # bool is a subclass of int, but True/False is not a meaningful discount.
    if isinstance(d, (int, float)) and not isinstance(d, bool):
        try:
            return float(d)
        except OverflowError:
            # An int too large for a float: treat as unusable rather than raise.
            return None
    if isinstance(d, str):
        match = re.search(r'\d+(?:\.\d+)?', d)
        return float(match.group(0)) if match else None
    return None

def clean_rating(r):
    """Convert a raw rating to a float.

    Args:
        r: Raw rating; anything float() accepts (number or numeric string).

    Returns:
        The rating as a float, or None when float() cannot convert it
        (e.g. None, "" or non-numeric text).
    """
    try:
        return float(r)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "missing"; a bare except would also
        # swallow KeyboardInterrupt and SystemExit.
        return None

def clean_stock(s):
    """Normalize a boolean-like flag to 1, 0 or None.

    Args:
        s: Raw flag. Accepted forms are bool, the ints 0/1, and the strings
            "true"/"yes"/"1" and "false"/"no"/"0" (case-insensitive,
            surrounding whitespace ignored).

    Returns:
        1 for a true value, 0 for a false value, None for anything else.
    """
    # bool must be checked before int because bool is a subclass of int.
    if isinstance(s, bool):
        return int(s)
    if isinstance(s, int) and s in (0, 1):
        return s
    if isinstance(s, str):
        s = s.strip().lower()
        if s in ("true", "yes", "1"):
            return 1
        if s in ("false", "no", "0"):
            return 0
    return None

def _format_details(details):
    """Flatten a ``product_details`` value into one "key: value | ..." string.

    Only list input is processed; anything else yields "". Entries that are not
    non-empty dicts are skipped, and only the first key/value pair of each dict
    is used. The key is lowercased; the value is stringified and run through
    preprocess_text.
    """
    if not isinstance(details, list):
        return ""
    parts = []
    for entry in details:
        if not (isinstance(entry, dict) and len(entry) > 0):
            continue
        first_key, first_value = next(iter(entry.items()))
        parts.append(f"{first_key.lower()}: {preprocess_text(str(first_value))}")
    return " | ".join(parts)


def _build_record(producto):
    """Map one raw product dict to the flat record that becomes a CSV row."""
    return {
        "pid": producto.get("pid", ""),
        "title": preprocess_text(producto.get("title", "")),
        "description": preprocess_text(producto.get("description", "")),
        "brand": producto.get("brand", ""),
        "category": producto.get("category", ""),
        "sub_category": producto.get("sub_category", ""),
        "product_details": _format_details(producto.get("product_details", [])),
        "seller": producto.get("seller", ""),
        "out_of_stock": clean_stock(producto.get("out_of_stock", "")),
        "selling_price": clean_price(producto.get("selling_price", "")),
        "discount": clean_discount(producto.get("discount", "")),
        "actual_price": clean_price(producto.get("actual_price", "")),
        "average_rating": clean_rating(producto.get("average_rating", "")),
        "url": producto.get("url", ""),
    }


# One cleaned record per input product, in input order.
productos_procesados = [_build_record(producto) for producto in productos]

# === Save CSV ===
# Build a DataFrame from the processed records and write it to ruta_salida
# as UTF-8, omitting the DataFrame index column.
df = pd.DataFrame(productos_procesados)
df.to_csv(ruta_salida, index=False, encoding="utf-8")

print(f"✅ Preprocesamiento completado: {len(df)} productos guardados en '{ruta_salida}'")



Cargados 28080 productos correctamente
✅ Preprocesamiento completado: 28080 productos guardados en '../../data/productos_preprocesados.csv'
