In [1]:
from pydantic_settings import BaseSettings
from pydantic import Field, SecretStr
from dotenv import load_dotenv
import pandas as pd
import numpy as np
from sqlalchemy import *
from sqlalchemy.orm import declarative_base, sessionmaker
from sqlalchemy.dialects.mysql import insert
import json
import hashlib

load_dotenv()

class Settings(BaseSettings):
    # Database
    DB_HOST: str = Field(default="localhost", env="DB_HOST")
    DB_PORT: int = Field(default=3306, env="DB_PORT")
    DB_USER: str = Field(default="root", env="DB_USER")
    DB_PASSWORD: SecretStr = Field(..., env="DB_PASSWORD")
    DB_NAME: str = Field(default="indumine_db", env="DB_NAME")

    @property
    def DATABASE_URL(self) -> str:
        return (
            f"mysql+pymysql://{self.DB_USER}:{self.DB_PASSWORD.get_secret_value()}"
            f"@{self.DB_HOST}:{self.DB_PORT}/{self.DB_NAME}"
            f"?charset=utf8mb4"
        )
    
    model_config = {
        "env_file": ".env",
        "case_sensitive": True
    }

settings = Settings()

engine = create_engine(settings.DATABASE_URL)
Base = declarative_base()

raw_data = pd.read_csv('../data/weg_products_final copy.csv', sep=',', encoding='utf-8', low_memory=False)
raw_data.head()

C:\Users\Deyvi\AppData\Local\Temp\ipykernel_44884\2560668656.py:16: PydanticDeprecatedSince20: Using extra keyword arguments on `Field` is deprecated and will be removed. Use `json_schema_extra` instead. (Extra keys: 'env'). Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  DB_HOST: str = Field(default="localhost", env="DB_HOST")
C:\Users\Deyvi\AppData\Local\Temp\ipykernel_44884\2560668656.py:17: PydanticDeprecatedSince20: Using extra keyword arguments on `Field` is deprecated and will be removed. Use `json_schema_extra` instead. (Extra keys: 'env'). Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  DB_PORT: int = Field(default=3306, env="DB_PORT")
C:\Users\Deyvi\AppData\Local\Temp\ipykernel_44884\2560668656.py:18: PydanticDeprecatedSince20: Using extra keyword arguments on `Field` is deprecated and will be removed. Use `jso

Unnamed: 0,Product URL,Feature,Value
0,https://www.weg.net/catalog/weg/BR/en///CAPACI...,Product Name,CAPACITIVE UNIT UCWT0.5V25 L10 HD
1,https://www.weg.net/catalog/weg/BR/en///CAPACI...,Product Code,10045998
2,https://www.weg.net/catalog/weg/BR/en///CAPACI...,Description,WEG power factor correction capacitors were de...
3,https://www.weg.net/catalog/weg/BR/en///CAPACI...,Complement,Heavy Duty
4,https://www.weg.net/catalog/weg/BR/en///CAPACI...,Reactive power,0.5 kVAr


In [None]:
specs_series = (
    raw_data.drop_duplicates(subset=['Product URL', 'Feature', 'Value'])
    .groupby('Product URL')
    .apply(lambda x: dict(zip(x['Feature'], x['Value'])), include_groups=False)
)

products_df = pd.DataFrame(specs_series).reset_index()
products_df.columns = ['url', 'specs']

def generate_id(row):
    specs = row['specs']
    return specs.get('Product Code', hashlib.md5(row['url'].encode()).hexdigest()[:20])

products_df['id'] = products_df.apply(generate_id, axis=1)

products_df['name'] = products_df['specs'].apply(lambda s: s.get('Product Name', 'Produto sem Nome'))
products_df['description'] = products_df['specs'].apply(lambda s: s.get('Description', ''))

products_df['category'] = products_df['url'].str.extract(r'///([^/]+)')[0]
products_df['category'] = products_df['category'].str.split('-').str[0]

products_df['category'] = products_df['category'].fillna("Geral").replace("", "Geral")

products_df['images'] = "[]"
products_df['scraped_at'] = pd.Timestamp.now().isoformat()

# Organiza colunas
products_df = products_df[['id', 'url', 'name', 'category', 'description', 'specs', 'images', 'scraped_at']]

In [3]:
class Products(Base):
    __tablename__ = 'products'

    id = Column(String(50), primary_key=True)
    url = Column(Text, nullable=False)
    name = Column(String(255), nullable=False)
    category = Column(String(100), nullable=False)
    description = Column(JSON, nullable=True)
    specs = Column(Text, nullable=True)
    images = Column(Text, nullable=True)
    scraped_at = Column(String(50), nullable=False)

def create_tables(engine):
    Base.metadata.create_all(engine)
create_tables(engine)

In [4]:
def mysql_upsert(table_class, engine, df, chunk_size=500):
    df_clean = df.where(pd.notnull(df), None)
    
    total = len(df_clean)
    print(f"Iniciando Upsert de {total} produtos...")

    for i in range(0, total, chunk_size):
        chunk = df_clean.iloc[i : i + chunk_size]
        records = chunk.to_dict(orient='records')
        
        for r in records:
            if isinstance(r['specs'], dict):
                r['specs'] = json.dumps(r['specs'])
            if isinstance(r['description'], (dict, list)):
                r['description'] = json.dumps(r['description'])

        Session = sessionmaker(bind=engine)
        session = Session()
        try:
            stmt = insert(table_class).values(records)
            update_dict = {
                c.name: stmt.inserted[c.name]
                for c in table_class.__table__.columns if not c.primary_key
            }
            upsert_stmt = stmt.on_duplicate_key_update(update_dict)
            session.execute(upsert_stmt)
            session.commit()
            print(f"Lote {i//chunk_size + 1} enviado... ({min(i+chunk_size, total)}/{total})")
        except Exception as e:
            session.rollback()
            print(f"Erro no lote {i}: {e}")
        finally:
            session.close()

# Executar
mysql_upsert(Products, engine, products_df)

Iniciando Upsert de 4051 produtos...
Lote 1 enviado... (500/4051)
Lote 2 enviado... (1000/4051)
Lote 3 enviado... (1500/4051)
Lote 4 enviado... (2000/4051)
Lote 5 enviado... (2500/4051)
Lote 6 enviado... (3000/4051)
Lote 7 enviado... (3500/4051)
Lote 8 enviado... (4000/4051)
Lote 9 enviado... (4051/4051)
