Primeiro, baixe as bibliotecas do python em um ambiente virtual

python -m venv .venv

cd .venv\Scripts\activate

pip install pandas

pip install numpy

pip install pyarrow

pip install parquet

pip install scikit-learn

pip install matplotlib

pip install seaborn

In [1]:
import pandas as pd
import numpy as np
import random

from datetime import datetime, timedelta

pd.set_option("display.max_columns", None)
random.seed(42)
np.random.seed(42)


In [2]:
# Generate products_raw.json

N_PRODUCTS = 10_000

categories = ["Rolamentos", "Mancais", "Componentes Mecânicos"]
bearing_types = ["Esférico", "Cilíndrico", "Autocompensador", "Agujas", "Contato Angular"]
materials = ["Aço", "Aço Inoxidável", "Cerâmica"]
problem_types = ["Vibração", "Superaquecimento", "Desgaste", "Contaminação"]

products = []

for i in range(N_PRODUCTS):
    products.append({
        "product_id": f"P{i+1:05d}",
        "product_name": f"Rolamento Industrial {i+1}",
        "product_category": random.choice(categories),
        "product_subcategory": random.choice(bearing_types),
        "manufacturer": random.choice(["SKF", "NSK", "FAG", "Timken", "NTN"]),
        "model": f"MD-{random.randint(100,999)}",
        "bearing_type": random.choice(bearing_types),
        "material": random.choice(materials),
        "load_capacity": round(random.uniform(500, 50000), 2),
        "max_speed": random.randint(1000, 15000),
        "temperature_limit": random.randint(80, 250),
        "problem_type": random.choice(problem_types),
        "unit_cost": round(random.uniform(50, 500), 2),
        "list_price": round(random.uniform(200, 3000), 2)
    })


products_df = pd.DataFrame(products)

# Opção 1: Regenerar dados com constraint - resolução de margem de lucro quebrada
products_df = products_df[products_df['unit_cost'] < products_df['list_price']]

products_df.to_json("../data/raw/products_raw.json", orient="records", indent=2)

products_df.head()


Unnamed: 0,product_id,product_name,product_category,product_subcategory,manufacturer,model,bearing_type,material,load_capacity,max_speed,temperature_limit,problem_type,unit_cost,list_price
1,P00002,Rolamento Industrial 2,Rolamentos,Cilíndrico,NSK,MD-617,Contato Angular,Aço,28281.63,12731,246,Contaminação,149.2,1849.94
2,P00003,Rolamento Industrial 3,Rolamentos,Cilíndrico,Timken,MD-448,Autocompensador,Aço,11158.03,13509,166,Vibração,91.74,470.81
3,P00004,Rolamento Industrial 4,Mancais,Contato Angular,FAG,MD-926,Esférico,Cerâmica,23241.53,3045,176,Vibração,298.42,2522.33
4,P00005,Rolamento Industrial 5,Componentes Mecânicos,Autocompensador,NTN,MD-296,Esférico,Aço,33232.53,13665,154,Vibração,434.89,2626.15
5,P00006,Rolamento Industrial 6,Mancais,Autocompensador,Timken,MD-750,Autocompensador,Aço,18823.96,4432,148,Vibração,324.11,679.19


In [3]:
# Generate customers_raw.csv

N_CUSTOMERS = 5_000

industries = [
    "Mineração", "Siderurgia", "Alimentos", "Automotiva",
    "Papel e Celulose", "Química", "Cimento", "Energia"
]

# Industry mapping com problemas esperados - resolução de campo descritivo para feature engineering
industry_problem_map = {
    "Mineração": ["Vibração", "Desgaste", "Contaminação"],
    "Siderurgia": ["Superaquecimento", "Vibração", "Desgaste"],
    "Alimentos": ["Contaminação", "Desgaste"],
    "Automotiva": ["Vibração", "Desgaste"],
    "Papel e Celulose": ["Contaminação", "Desgaste"],
    "Química": ["Corrosão", "Superaquecimento"],
    "Cimento": ["Vibração", "Desgaste"],
    "Energia": ["Superaquecimento", "Vibração"]
}

customers = []

for i in range(N_CUSTOMERS):
    industry = random.choice(industries)
    customers.append({
        "customer_id": f"C{i+1:05d}",
        "company_name": f"Empresa Industrial {i+1}",
        "industry": industry,
        "company_size": random.choice(["Pequena", "Média", "Grande"]),
        "maintenance_model": random.choice(["Interna", "Terceirizada", "Mista"]),
        "equipment_criticality": random.choice(["Baixa", "Média", "Alta"]),
        "expected_problems": industry_problem_map.get(industry, []),
        "annual_revenue_estimated": round(random.uniform(5e6, 5e9), 2),
        "maintenance_budget_annual": round(random.uniform(50_000, 5_000_000), 2),
        "downtime_cost_per_hour": round(random.uniform(1_000, 50_000), 2),
        "preferred_supplier": random.choice([True, False]),
        "relationship_start_date": (
            datetime(2000,1,1) + timedelta(days=random.randint(0,9000))
        ).date(),
        "active": random.choice([True, True, True, False]),
        "last_updated": datetime.now()
    })

customers_df = pd.DataFrame(customers)
customers_df.to_csv("../data/raw/customers_raw.csv", index=False)

customers_df.head()


Unnamed: 0,customer_id,company_name,industry,company_size,maintenance_model,equipment_criticality,expected_problems,annual_revenue_estimated,maintenance_budget_annual,downtime_cost_per_hour,preferred_supplier,relationship_start_date,active,last_updated
0,C00001,Empresa Industrial 1,Química,Grande,Terceirizada,Baixa,"[Corrosão, Superaquecimento]",1474241000.0,2824887.23,3170.4,False,2016-10-22,True,2026-01-02 22:58:37.967216
1,C00002,Empresa Industrial 2,Automotiva,Grande,Mista,Média,"[Vibração, Desgaste]",3453635000.0,2913305.71,41838.33,True,2022-03-26,True,2026-01-02 22:58:37.967272
2,C00003,Empresa Industrial 3,Alimentos,Pequena,Mista,Alta,"[Contaminação, Desgaste]",4505871000.0,154697.84,12088.05,True,2022-10-12,True,2026-01-02 22:58:37.967302
3,C00004,Empresa Industrial 4,Automotiva,Pequena,Mista,Alta,"[Vibração, Desgaste]",1170286000.0,663312.06,10768.29,False,2001-10-06,True,2026-01-02 22:58:37.967328
4,C00005,Empresa Industrial 5,Automotiva,Pequena,Interna,Média,"[Vibração, Desgaste]",3405395000.0,1099120.2,39752.67,True,2013-04-30,True,2026-01-02 22:58:37.967343


In [4]:
# Generate sales_raw.csv

N_SALES = 120_000
START_DATE = datetime(2023, 1, 1)
END_DATE = datetime(2025, 12, 31)

def random_date(start, end):
    return start + timedelta(days=random.randint(0, (end - start).days))

sales = []

for i in range(N_SALES):
    product = products_df.sample(1).iloc[0]
    customer = customers_df.sample(1).iloc[0]

    quantity = random.randint(1, 40)
    unit_price = round(product["list_price"] * random.uniform(0.9, 1.05), 2)

    sales.append({
        "sale_id": f"S{i+1:07d}",
        "sale_date": random_date(START_DATE, END_DATE).date(),
        "customer_id": customer["customer_id"],
        "product_id": product["product_id"],
        "quantity": quantity,
        "unit_price": unit_price,
        "total_price": round(quantity * unit_price, 2),
        "discount_percentage": random.choice([0, 5, 10, 15]),
        "sales_channel": random.choice(["Direct", "Distributor", "Representative"]),
        "contract_type": random.choice(["Spot", "Recurring", "SLA"]),
        "payment_terms": random.choice(["30 days", "60 days", "90 days"]),
        "delivery_lead_time_days": random.randint(3, 25),
        "sale_status": random.choice(["Completed", "Completed", "Completed", "Cancelled"]),
        "last_updated": datetime.now()
    })

sales_df = pd.DataFrame(sales)
sales_df.to_csv("../data/raw/sales_raw.csv", index=False)

sales_df.head()


Unnamed: 0,sale_id,sale_date,customer_id,product_id,quantity,unit_price,total_price,discount_percentage,sales_channel,contract_type,payment_terms,delivery_lead_time_days,sale_status,last_updated
0,S0000001,2023-11-02,C00567,P06773,9,2474.01,22266.09,15,Distributor,Spot,60 days,25,Completed,2026-01-02 22:58:41.092284
1,S0000002,2025-05-01,C04510,P01027,5,827.72,4138.6,0,Representative,SLA,90 days,16,Completed,2026-01-02 22:58:41.099095
2,S0000003,2023-11-12,C01346,P08663,26,611.03,15886.78,5,Distributor,Spot,30 days,17,Cancelled,2026-01-02 22:58:41.101476
3,S0000004,2025-05-09,C03136,P04144,5,974.8,4874.0,10,Distributor,Recurring,90 days,20,Cancelled,2026-01-02 22:58:41.103757
4,S0000005,2023-05-20,C03235,P07099,20,2113.65,42273.0,15,Distributor,Recurring,30 days,7,Completed,2026-01-02 22:58:41.105974


In [5]:
# Validate referential integrity

print("Produtos:", products_df.shape)
print("Clientes:", customers_df.shape)
print("Vendas:", sales_df.shape)

assert sales_df["product_id"].isin(products_df["product_id"]).all()
assert sales_df["customer_id"].isin(customers_df["customer_id"]).all()

print("✔ Integridade referencial validada")


Produtos: (9641, 14)
Clientes: (5000, 14)
Vendas: (120000, 14)
✔ Integridade referencial validada


In [None]:
# Structure

# data/
#  └── raw/
#      ├── products_raw.json   ✔ ~10.000 
#      ├── customers_raw.csv   ✔ 5.000
#      └── sales_raw.csv       ✔ 120.000