### <center>Limpeza dos Dados e Transformação em Parquet</center>

--------

#### Bibliotecas

In [1]:
import os
from io import BytesIO
from pathlib import Path

import pandas as pd
from minio import Minio

#### Variáveis Locais

In [2]:
# MinIO connection settings.
# The endpoint and credentials are read from environment variables so real
# secrets do not have to be committed with the notebook; the original
# hard-coded values remain as fallbacks when a variable is not set.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "localhost:9000")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minio")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minio123")
# Bucket that holds the bronze/ and silver/ prefixes used below.
BUCKET_NAME = "xau-lake"

In [3]:
# Timeframes selected for processing.
ALLOWED_TIMEFRAMES = {
    "5m",
    "15m",
    "30m",
    "1h",
    "4h",
    "1d",
}

#### Conferindo Client

In [4]:
# Build the MinIO client from the connection settings defined in the
# configuration cell. secure=False is passed through unchanged.
client = Minio(
    endpoint=MINIO_ENDPOINT,
    access_key=MINIO_ACCESS_KEY,
    secret_key=MINIO_SECRET_KEY,
    secure=False,
)

In [5]:
# Fail fast when the target bucket is missing; otherwise confirm the connection.
if client.bucket_exists(BUCKET_NAME):
    print("Conectado ao MinIO e bucket encontrado.")
else:
    raise RuntimeError(f"Bucket {BUCKET_NAME} não existe.")

Conectado ao MinIO e bucket encontrado.


#### Verificando dados no MinIO (Bronze)

In [6]:
# Materialize the listing of every object under the bronze/ prefix,
# then print each object's key and size.
bronze_objects = [
    *client.list_objects(BUCKET_NAME, prefix="bronze/", recursive=True)
]

for bronze_obj in bronze_objects:
    print(f"{bronze_obj.object_name} - {bronze_obj.size} bytes")

bronze/timeframe=15m/XAU_15m_data.csv - 25264861 bytes
bronze/timeframe=1Month/XAU_1Month_data.csv - 14203 bytes
bronze/timeframe=1d/XAU_1d_data.csv - 293221 bytes
bronze/timeframe=1h/XAU_1h_data.csv - 6469508 bytes
bronze/timeframe=1m/XAU_1m_data.csv - 343143171 bytes
bronze/timeframe=1w/XAU_1w_data.csv - 60328 bytes
bronze/timeframe=30m/XAU_30m_data.csv - 12799617 bytes
bronze/timeframe=4h/XAU_4h_data.csv - 1723220 bytes
bronze/timeframe=5m/XAU_5m_data.csv - 73338363 bytes


#### Dados XAU_1D

##### Carregando os dados e Verificando Tipos

In [7]:
# Object key of the raw daily (1d) CSV in the Bronze layer.
bronze_key_1d = "bronze/timeframe=1d/XAU_1d_data.csv"

In [8]:
# Download the raw daily CSV from the Bronze layer into memory.
response = client.get_object(BUCKET_NAME, bronze_key_1d)
try:
    data_bytes = response.read()
finally:
    # Always close the stream and release the connection, even if the
    # read fails part-way through.
    response.close()
    response.release_conn()

In [9]:
# Parse the downloaded bytes as a semicolon-separated CSV.
df_1d = pd.read_csv(BytesIO(data_bytes), sep=";")

In [10]:
# Show the (rows, columns) dimensions of the loaded frame.
print(f"Shape: {df_1d.shape}")

Shape: (5462, 6)


In [11]:
# Preview the first five raw rows.
display(df_1d.head(n=5))

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2004.06.11 00:00,384.0,384.8,382.8,384.1,272
1,2004.06.14 00:00,384.3,385.8,381.8,382.8,1902
2,2004.06.15 00:00,382.8,388.8,381.1,388.6,1951
3,2004.06.16 00:00,387.1,389.8,382.6,383.8,2014
4,2004.06.17 00:00,383.6,389.3,383.0,387.6,1568


In [12]:
# Print the column dtypes of the raw frame under a "Dtypes:" heading.
print("\nDtypes:", df_1d.dtypes, sep="\n")


Dtypes:
Date       object
Open      float64
High      float64
Low       float64
Close     float64
Volume      int64
dtype: object


##### Transformação dos Dados

In [13]:
# Convert the "Date" column from text (e.g. "2004.06.11 00:00") to datetime.
df_1d = df_1d.assign(
    Date=pd.to_datetime(df_1d["Date"], format="%Y.%m.%d %H:%M")
)

In [14]:
# Standardize column names: "Date" becomes "timestamp" and the price/volume
# columns are lower-cased.
column_names = {"Date": "timestamp"}
column_names.update(
    {name: name.lower() for name in ("Open", "High", "Low", "Close", "Volume")}
)
df_1d = df_1d.rename(columns=column_names)

In [15]:
# Put the rows in chronological order.
df_1d = df_1d.sort_values(by="timestamp")

In [16]:
# Show the column dtypes after the transformations above.
print(df_1d.dtypes)

timestamp    datetime64[ns]
open                float64
high                float64
low                 float64
close               float64
volume                int64
dtype: object


In [17]:
# Preview the first five rows of the transformed frame.
df_1d.head(n=5)

Unnamed: 0,timestamp,open,high,low,close,volume
0,2004-06-11,384.0,384.8,382.8,384.1,272
1,2004-06-14,384.3,385.8,381.8,382.8,1902
2,2004-06-15,382.8,388.8,381.1,388.6,1951
3,2004-06-16,387.1,389.8,382.6,383.8,2014
4,2004-06-17,383.6,389.3,383.0,387.6,1568


##### Salvando os Dados em Silver 

In [18]:
# Serialize the frame to Parquet in an in-memory buffer (index not written).
buffer = BytesIO()
df_1d.to_parquet(path=buffer, index=False)
# Rewind so the upload reads from the first byte; as the cell's last
# expression, seek() also displays its return value.
buffer.seek(0)

0

In [19]:
# Destination object key for the daily (1d) Parquet file in the Silver layer.
silver_key_1d = "silver/timeframe=1d/XAU_1d_data.parquet"

In [20]:
# Upload the in-memory Parquet file to the Silver layer in MinIO.
# The write result is the cell's last expression, so it is displayed.
client.put_object(
    bucket_name=BUCKET_NAME,
    object_name=silver_key_1d,
    data=buffer,
    length=buffer.getbuffer().nbytes,
    content_type="application/octet-stream",
)

ObjectWriteResult(bucket_name='xau-lake', object_name='silver/timeframe=1d/XAU_1d_data.parquet', version_id=None, etag='b374f1f72395ae536c7312a4480fbff6', http_headers=HTTPHeaderDict({'Accept-Ranges': 'bytes', 'Content-Length': '0', 'ETag': '"b374f1f72395ae536c7312a4480fbff6"', 'Server': 'MinIO', 'Strict-Transport-Security': 'max-age=31536000; includeSubDomains', 'Vary': 'Origin, Accept-Encoding', 'X-Amz-Id-2': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8', 'X-Amz-Request-Id': '18834EEFCCD04BF4', 'X-Content-Type-Options': 'nosniff', 'X-Ratelimit-Limit': '7037', 'X-Ratelimit-Remaining': '7037', 'X-Xss-Protection': '1; mode=block', 'Date': 'Sun, 21 Dec 2025 18:37:50 GMT'}), last_modified=None, location=None)

In [21]:
# Confirm which Silver key the daily file was written to.
print(f"Arquivo Silver 1d enviado para: {silver_key_1d}")

Arquivo Silver 1d enviado para: silver/timeframe=1d/XAU_1d_data.parquet


In [22]:
# Verify the stored file by listing the Silver 1d prefix with object sizes.
silver_1d_listing = client.list_objects(
    BUCKET_NAME, prefix="silver/timeframe=1d/", recursive=True
)
for stored in silver_1d_listing:
    print(f"{stored.object_name} - {stored.size} bytes")

silver/timeframe=1d/XAU_1d_data.parquet - 209397 bytes


#### Pipeline para Todos os Timeframes Selecionados

In [23]:
from io import BytesIO

def bronze_to_silver_timeframe(timeframe: str) -> None:
    """
    Convert one raw Bronze CSV into a cleaned Silver Parquet object in MinIO.

    Downloads ``bronze/timeframe=<timeframe>/XAU_<timeframe>_data.csv`` from
    ``BUCKET_NAME``, parses the ``Date`` column, renames the columns
    (``Date`` becomes ``timestamp``; the price/volume columns are lower-cased),
    sorts by ``timestamp``, adds a constant ``timeframe`` column and uploads
    the result as
    ``silver/timeframe=<timeframe>/XAU_<timeframe>_data.parquet``.

    Args:
        timeframe: Timeframe label used to build both object keys, e.g. ``"1d"``.

    Returns:
        None. A confirmation line is printed once the upload has finished.

    Notes:
        Relies on the notebook-level ``client`` and ``BUCKET_NAME``. Errors
        raised by the MinIO client or by pandas are not caught here.
    """
    bronze_key = f"bronze/timeframe={timeframe}/XAU_{timeframe}_data.csv"

    # Bronze -> DataFrame
    response = client.get_object(BUCKET_NAME, bronze_key)
    try:
        data_bytes = response.read()
    finally:
        # Always close the stream and release the connection, even if the
        # read fails part-way through.
        response.close()
        response.release_conn()

    df = pd.read_csv(BytesIO(data_bytes), delimiter=";")

    # Cleaning: parse dates, standardize column names, order chronologically.
    df["Date"] = pd.to_datetime(df["Date"], format="%Y.%m.%d %H:%M")
    df = df.rename(columns={
        "Date": "timestamp",
        "Open": "open",
        "High": "high",
        "Low": "low",
        "Close": "close",
        "Volume": "volume",
    })
    df = df.sort_values("timestamp")
    # Store the timeframe label on every row of the output.
    df["timeframe"] = timeframe

    # DataFrame -> Parquet (Silver), serialized in memory.
    buffer = BytesIO()
    df.to_parquet(buffer, index=False)
    # Rewind so put_object reads the payload from its first byte.
    buffer.seek(0)

    silver_key = f"silver/timeframe={timeframe}/XAU_{timeframe}_data.parquet"

    client.put_object(
        BUCKET_NAME,
        silver_key,
        data=buffer,
        length=buffer.getbuffer().nbytes,
        content_type="application/octet-stream",
    )

    print(f"[OK] timeframe {timeframe} -> {silver_key}")

In [24]:
# Convert only the selected timeframes from Bronze to Silver.
# ALLOWED_TIMEFRAMES comes from the configuration cell near the top of the
# notebook; it is not redefined here so the two definitions cannot drift.
# Iterating in sorted order keeps the processing order and the log output
# reproducible, because the iteration order of a set of strings can change
# between kernel sessions.
for tf in sorted(ALLOWED_TIMEFRAMES):
    bronze_to_silver_timeframe(tf)

[OK] timeframe 15m -> silver/timeframe=15m/XAU_15m_data.parquet
[OK] timeframe 4h -> silver/timeframe=4h/XAU_4h_data.parquet
[OK] timeframe 5m -> silver/timeframe=5m/XAU_5m_data.parquet
[OK] timeframe 1h -> silver/timeframe=1h/XAU_1h_data.parquet
[OK] timeframe 1d -> silver/timeframe=1d/XAU_1d_data.parquet
[OK] timeframe 30m -> silver/timeframe=30m/XAU_30m_data.parquet


In [25]:
# Verify the Silver layer by listing every object under silver/ with its size.
silver_listing = client.list_objects(BUCKET_NAME, prefix="silver/", recursive=True)
for stored in silver_listing:
    print(f"{stored.object_name} - {stored.size} bytes")

silver/timeframe=15m/XAU_15m_data.parquet - 11077127 bytes
silver/timeframe=1d/XAU_1d_data.parquet - 209920 bytes
silver/timeframe=1h/XAU_1h_data.parquet - 3579644 bytes
silver/timeframe=30m/XAU_30m_data.parquet - 6221704 bytes
silver/timeframe=4h/XAU_4h_data.parquet - 1101679 bytes
silver/timeframe=5m/XAU_5m_data.parquet - 27811153 bytes
