Feature Engineering com LLM (Produtos Industriais)

In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

In [15]:
PRODUCTS_PATH = "../data/trusted/products_trusted.parquet"
CUSTOMERS_PATH = "../data/trusted/customers_trusted.parquet"

products = pd.read_parquet(PRODUCTS_PATH)
customers = pd.read_parquet(CUSTOMERS_PATH)

print(products.shape, customers.shape)

(9641, 17) (5000, 14)


In [16]:
# def build_product_text(row):
#     features = ", ".join(row['technical_description'])
    
#     text = (
#         f"Produto industrial do tipo {row['bearing_type']} "
#         f"fabricado em {row['material']}. "
#         f"Suporta carga de até {row['load_capacity']} N, "
#         f"velocidade máxima de {row['max_speed']} RPM "
#         f"e temperatura até {row['temperature_limit']} °C. "
#         f"Características técnicas: {features}."
#     )
#     return text

# products['llm_product_description'] = products.apply(build_product_text, axis=1)

# Já temos technical_description e technical_features do notebook 02
# Vamos apenas confirmar que existem
print("✓ Campos disponíveis:")
print(f"  - technical_description: {type(products['technical_description'].iloc[0])}")
print(f"  - technical_features: {type(products['technical_features'].iloc[0])}")
print(f"  - llm_product_description: {type(products['llm_product_description'].iloc[0])}")

✓ Campos disponíveis:
  - technical_description: <class 'str'>
  - technical_features: <class 'numpy.ndarray'>
  - llm_product_description: <class 'str'>


In [17]:
problem_keywords = {
    "Vibração": ["vibration", "stability", "balance"],
    "Desgaste": ["wear", "durability", "long life"],
    "Superaquecimento": ["heat", "temperature", "cooling"],
    "Corrosão": ["corrosion", "stainless", "humidity"],
    "Contaminação": ["sealed", "hygiene", "food"]
}

def infer_supported_problems(text):
    text_lower = text.lower()
    supported = []
    
    for problem, keywords in problem_keywords.items():
        if any(k in text_lower for k in keywords):
            supported.append(problem)
    
    return supported if supported else ["Uso Geral"]

products['supported_problems'] = products['llm_product_description'].apply(
    infer_supported_problems
)

In [18]:
mlb = MultiLabelBinarizer()

problem_features = mlb.fit_transform(products['supported_problems'])

problem_features_df = pd.DataFrame(
    problem_features,
    columns=[f"problem_{c}" for c in mlb.classes_]
)

products_ml = pd.concat(
    [products.reset_index(drop=True), problem_features_df],
    axis=1
)

In [19]:
# ✅ CORREÇÃO: Criar um MultiLabelBinarizer separado para clientes
import ast

# Garantir que expected_problems seja lista antes de passar pro MLB
print("\nVerificando tipo de expected_problems...")
first_val = customers['expected_problems'].iloc[0]
print(f"Tipo: {type(first_val)}, Valor: {first_val}")

# Se for string, converter para lista
if isinstance(first_val, str):
    print("   ⚠️  Convertendo strings para listas...")
    customers['expected_problems'] = customers['expected_problems'].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) and x.startswith('[') else x
    )
    print("   ✓ Conversão concluída")

# Agora sim, fazer o encoding
mlb_customers = MultiLabelBinarizer()
customer_problem_features = mlb_customers.fit_transform(customers['expected_problems'])

customer_problem_df = pd.DataFrame(
    customer_problem_features,
    columns=[f"problem_{c.lower().replace(' ', '_').replace('ã', 'a').replace('ç', 'c').replace('õ', 'o')}" for c in mlb_customers.classes_]
)

customers_ml = pd.concat(
    [customers.reset_index(drop=True), customer_problem_df],
    axis=1
)

print(f"\n✓ Features binárias criadas: {list(customer_problem_df.columns)}")



Verificando tipo de expected_problems...
Tipo: <class 'str'>, Valor: ['Corrosão', 'Superaquecimento']
   ⚠️  Convertendo strings para listas...
   ✓ Conversão concluída

✓ Features binárias criadas: ['problem_contaminacao', 'problem_corrosao', 'problem_desgaste', 'problem_superaquecimento', 'problem_vibracao']


In [20]:
FEATURES_PATH = "../data/refined/"

products_ml.to_parquet(f"{FEATURES_PATH}/products_features.parquet", index=False)
customers_ml.to_parquet(f"{FEATURES_PATH}/customers_features.parquet", index=False)

print("✅ Features geradas e salvas com sucesso")

✅ Features geradas e salvas com sucesso


In [21]:
# Selecionar colunas problem_* e converter tudo para numérico
problem_df = products_ml.filter(like="problem_").apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)
print(problem_df.sum().sort_values(ascending=False))

problem_Uso Geral    9641
problem_type            0
dtype: int64


In [22]:
customers_ml.filter(like="problem_").sum().sort_values(ascending=False)

problem_desgaste            3767
problem_vibracao            3126
problem_superaquecimento    1902
problem_contaminacao        1889
problem_corrosao             614
dtype: int64