### <center> Limpeza dos dados e Transformação em Parquet</center>

--------

#### Bibliotecas

In [1]:
from pathlib import Path
import pandas as pd

#### Variáveis Locais

In [2]:
# Variáveis RAW e PARQUET
RAW_DIR = Path("..") / "data" / "raw"
PARQUET_DIR = Path("..") / "data" / "parquet"
PARQUET_DIR.mkdir(parents=True, exist_ok=True)

#### Conferindo Caminhos

In [3]:
RAW_DIR

PosixPath('../data/raw')

In [4]:
PARQUET_DIR

PosixPath('../data/parquet')

#### Dados XAU_15m

##### Carregando os dados e Verificando Tipos

In [5]:
csv_15m = RAW_DIR / "XAU_15m_data.csv"
df_15m = pd.read_csv(csv_15m, delimiter=";")

In [6]:
print("Shape:", df_15m.shape)

Shape: (485968, 6)


In [7]:
display(df_15m.head())

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2004.06.11 07:15,384.0,384.3,383.8,384.3,12
1,2004.06.11 07:30,383.8,384.3,383.6,383.8,12
2,2004.06.11 07:45,383.3,383.8,383.3,383.8,20
3,2004.06.11 08:00,383.8,384.1,383.6,383.6,8
4,2004.06.11 08:15,383.6,384.3,383.5,383.5,20


In [8]:
print("\nDtypes:")
print(df_15m.dtypes)


Dtypes:
Date       object
Open      float64
High      float64
Low       float64
Close     float64
Volume      int64
dtype: object


##### Transformação dos Dados

In [9]:
# Convertendo Coluna Date para Datetime
df_15m["Date"] = pd.to_datetime(
    df_15m["Date"],
    format="%Y.%m.%d %H:%M"
)

In [10]:
# Renomeando colunas visando padronizá-las
df_15m = df_15m.rename(columns={
    "Date": "timestamp",
    "Open": "open",
    "High": "high",
    "Low": "low",
    "Close": "close",
    "Volume": "volume",
})

In [11]:
# Ordenando os dados pelo tempo
df_15m = df_15m.sort_values("timestamp")

In [12]:
# Exibindo o tipo dos dados
print(df_15m.dtypes)

timestamp    datetime64[ns]
open                float64
high                float64
low                 float64
close               float64
volume                int64
dtype: object


In [13]:
# Exibindo uma amostra dos dados
df_15m.head()

Unnamed: 0,timestamp,open,high,low,close,volume
0,2004-06-11 07:15:00,384.0,384.3,383.8,384.3,12
1,2004-06-11 07:30:00,383.8,384.3,383.6,383.8,12
2,2004-06-11 07:45:00,383.3,383.8,383.3,383.8,20
3,2004-06-11 08:00:00,383.8,384.1,383.6,383.6,8
4,2004-06-11 08:15:00,383.6,384.3,383.5,383.5,20


##### Salvando os Dados em Parquet 

In [14]:
parquet_15m = PARQUET_DIR / "XAU_15m_data.parquet"

df_15m.to_parquet(parquet_15m, index=False)

print("Arquivo salvo em:", parquet_15m)

Arquivo salvo em: ../data/parquet/XAU_15m_data.parquet


In [16]:
# Teste para validar a qualidade dos Dados
df_15m_check = pd.read_parquet(parquet_15m)

In [17]:
print("Shape:", df_15m_check.shape)

Shape: (485968, 6)


In [18]:
print(df_15m_check.dtypes)

timestamp    datetime64[ns]
open                float64
high                float64
low                 float64
close               float64
volume                int64
dtype: object


In [19]:
df_15m_check.head()

Unnamed: 0,timestamp,open,high,low,close,volume
0,2004-06-11 07:15:00,384.0,384.3,383.8,384.3,12
1,2004-06-11 07:30:00,383.8,384.3,383.6,383.8,12
2,2004-06-11 07:45:00,383.3,383.8,383.3,383.8,20
3,2004-06-11 08:00:00,383.8,384.1,383.6,383.6,8
4,2004-06-11 08:15:00,383.6,384.3,383.5,383.5,20


#### Dados Globais

In [20]:
def clean_xau_csv(csv_path: Path, parquet_dir: Path) -> Path:
    """
    Lê um CSV XAU_*.csv, faz limpeza padrão e salva em Parquet.
    Retorna o caminho do arquivo Parquet criado.
    """
    df = pd.read_csv(csv_path, delimiter=";")

    # Converte data/hora
    df["Date"] = pd.to_datetime(df["Date"], format="%Y.%m.%d %H:%M")

    # Renomeia colunas
    df = df.rename(columns={
        "Date": "timestamp",
        "Open": "open",
        "High": "high",
        "Low": "low",
        "Close": "close",
        "Volume": "volume",
    })

    # Ordena
    df = df.sort_values("timestamp")

    # Deriva timeframe a partir do nome, ex.: XAU_15m_data.csv -> 15m
    fname = csv_path.name  # ex.: "XAU_15m_data.csv"
    timeframe = fname.split("_")[1]   # pega "15m"
    df["timeframe"] = timeframe

    # Caminho do parquet de saída
    parquet_path = parquet_dir / fname.replace(".csv", ".parquet")

    df.to_parquet(parquet_path, index=False)
    print(f"Salvo: {parquet_path}")

    return parquet_path

In [21]:
allowed_timeframes = {"5m", "15m", "30m", "1h", "4h", "1d"}

for csv_path in RAW_DIR.glob("XAU_*_data.csv"):
    parts = csv_path.stem.split("_")   # ex.: ["XAU", "15m", "data"]
    timeframe = parts[1]

    if timeframe not in allowed_timeframes:
        print(f"Ignorando {csv_path.name} (timeframe {timeframe} não está na lista).")
        continue

    print(f"Processando {csv_path.name} (timeframe {timeframe})...")
    clean_xau_csv(csv_path, PARQUET_DIR)

Processando XAU_1h_data.csv (timeframe 1h)...
Salvo: ../data/parquet/XAU_1h_data.parquet
Processando XAU_30m_data.csv (timeframe 30m)...
Salvo: ../data/parquet/XAU_30m_data.parquet
Ignorando XAU_1Month_data.csv (timeframe 1Month não está na lista).
Processando XAU_4h_data.csv (timeframe 4h)...
Salvo: ../data/parquet/XAU_4h_data.parquet
Ignorando XAU_1m_data.csv (timeframe 1m não está na lista).
Processando XAU_5m_data.csv (timeframe 5m)...
Salvo: ../data/parquet/XAU_5m_data.parquet
Ignorando XAU_1w_data.csv (timeframe 1w não está na lista).
Processando XAU_15m_data.csv (timeframe 15m)...
Salvo: ../data/parquet/XAU_15m_data.parquet
Processando XAU_1d_data.csv (timeframe 1d)...
Salvo: ../data/parquet/XAU_1d_data.parquet


In [22]:
list(PARQUET_DIR.iterdir())

[PosixPath('../data/parquet/XAU_30m_data.parquet'),
 PosixPath('../data/parquet/XAU_4h_data.parquet'),
 PosixPath('../data/parquet/XAU_5m_data.parquet'),
 PosixPath('../data/parquet/XAU_15m_data.parquet'),
 PosixPath('../data/parquet/XAU_1h_data.parquet'),
 PosixPath('../data/parquet/XAU_1d_data.parquet')]