<a href="https://colab.research.google.com/github/Blaydor09/Procesamiento_datos_bid_ask/blob/main/Precesamiento_de_datos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

def _load_side(csv_path, side):
    """Load one quote side (BID or ASK) as a lazy Dask DataFrame ready to merge.

    Args:
        csv_path: Path of the CSV file to read. It is expected to contain the
            columns 'Gmt time', 'Open', 'High', 'Low', 'Close' and 'Volume'.
        side: Suffix appended to the price/volume column names
            ("bid" or "ask").

    Returns:
        A Dask DataFrame whose 'Gmt time' column has been replaced by a parsed
        'timestamp' column, whose price/volume columns are renamed to
        ``open_<side>``, ``high_<side>``, ``low_<side>``, ``close_<side>`` and
        ``volume_<side>``, and from which rows with a volume of 0 are removed.
    """
    frame = dd.read_csv(csv_path)

    # 'Gmt time' is day-first (day/month/year), hence dayfirst=True.
    frame["timestamp"] = dd.to_datetime(frame["Gmt time"], dayfirst=True)
    # The raw text column is no longer needed once 'timestamp' exists.
    frame = frame.drop("Gmt time", axis=1)

    # Suffix the columns so BID and ASK values do not collide after the merge.
    frame = frame.rename(columns={
        name: f"{name.lower()}_{side}"
        for name in ("Open", "High", "Low", "Close", "Volume")
    })

    # Dropping zero-volume rows here, before the join, yields the same result
    # as filtering the merged frame (the join is an inner join and each
    # condition only looks at one side) while moving less data around.
    return frame[frame[f"volume_{side}"] != 0]


def merge_order_filter(bid_path, ask_path, output_path):
    """Merge BID and ASK CSV files by timestamp and save the result.

    Steps:
    - Read the BID and ASK files.
    - Parse the 'Gmt time' column into a datetime 'timestamp' column
      (day/month/year format) and drop the original 'Gmt time' column.
    - Rename the price/volume columns with a ``_bid`` / ``_ask`` suffix.
    - Discard the rows in which volume_bid or volume_ask is 0.
    - Inner-join both DataFrames on 'timestamp'.
    - Sort by 'timestamp'.
    - Write the result to ``output_path`` as CSV, without an index column.

    Args:
        bid_path: Path of the BID CSV file.
        ask_path: Path of the ASK CSV file.
        output_path: Path of the CSV file to write.

    Returns:
        None. The merged data is written to ``output_path`` and a confirmation
        message is printed.
    """
    df_bid = _load_side(bid_path, "bid")
    df_ask = _load_side(ask_path, "ask")

    # Keep only the timestamps present on both sides.
    df_merged = dd.merge(df_bid, df_ask, on="timestamp", how="inner")

    # set_index sorts the data by 'timestamp'; reset_index then turns it back
    # into a regular column, which also places 'timestamp' first in the CSV.
    df_sorted = df_merged.set_index("timestamp").reset_index()

    # Materialize the result and write it out as a single CSV file.
    with ProgressBar():
        df_sorted.compute().to_csv(output_path, index=False)

    print("✅ Datos combinados y filtrados guardados en:", output_path)

# ---------------------------------------
# Input and output files for each timeframe
# ---------------------------------------
# 15-minute candles
bid_path_15 = "/content/EURUSD_Candlestick_15_m_BID_30.12.2021-30.12.2024.csv"
ask_path_15 = "/content/EURUSD_Candlestick_15_m_ASK_30.12.2021-30.12.2024.csv"
output_path_15 = "/content/merged_bid_ask_15_min_filtrado.csv"

# 1-hour candles
bid_path_1h = "/content/EURUSD_Candlestick_1_h_BID_30.12.2021-30.12.2024.csv"
ask_path_1h = "/content/EURUSD_Candlestick_1_h_ASK_30.12.2021-30.12.2024.csv"
output_path_1h = "/content/merged_bid_ask_1_hora_filtrado.csv"

# 1-day candles
bid_path_1d = "/content/EURUSD_Candlestick_1_D_BID_30.12.2021-30.12.2024.csv"
ask_path_1d = "/content/EURUSD_Candlestick_1_D_ASK_30.12.2021-30.12.2024.csv"
output_path_1d = "/content/merged_bid_ask_1_dia_filtrado.csv"

# Process the timeframes in order: 15 minutes, 1 hour, 1 day.
for bid_csv, ask_csv, merged_csv in (
    (bid_path_15, ask_path_15, output_path_15),
    (bid_path_1h, ask_path_1h, output_path_1h),
    (bid_path_1d, ask_path_1d, output_path_1d),
):
    merge_order_filter(bid_csv, ask_csv, merged_csv)


[########################################] | 100% Completed | 1.33 s
✅ Datos combinados y filtrados guardados en: /content/merged_bid_ask_15_min_filtrado.csv
[########################################] | 100% Completed | 389.89 ms
✅ Datos combinados y filtrados guardados en: /content/merged_bid_ask_1_hora_filtrado.csv
[########################################] | 100% Completed | 102.36 ms
✅ Datos combinados y filtrados guardados en: /content/merged_bid_ask_1_dia_filtrado.csv


In [None]:
import pandas as pd

# Load the filtered 1-day file written by the merge step.
file_path = "/content/merged_bid_ask_1_dia_filtrado.csv"
df = pd.read_csv(file_path)

# Per-column count of missing values and of values equal to 0.
null_counts = df.isna().sum()
zero_counts = df.eq(0).sum()

# Each report is a heading followed by the per-column counts on the next line.
print("Valores nulos en cada columna:", null_counts, sep="\n")
print("\nValores 0 en cada columna:", zero_counts, sep="\n")


Valores nulos en cada columna:
timestamp     0
open_bid      0
high_bid      0
low_bid       0
close_bid     0
volume_bid    0
open_ask      0
high_ask      0
low_ask       0
close_ask     0
volume_ask    0
dtype: int64

Valores 0 en cada columna:
timestamp     0
open_bid      0
high_bid      0
low_bid       0
close_bid     0
volume_bid    0
open_ask      0
high_ask      0
low_ask       0
close_ask     0
volume_ask    0
dtype: int64


> Comprobación: el archivo filtrado no debe contener valores nulos ni valores iguales a 0.

In [None]:
# Create the folder that will be zipped (-p: no error if it already exists).
!mkdir -p /content/datos_procesados_merged
# The merged CSVs are written to /content, so copy them into the folder;
# otherwise the archive built from this folder would contain no data files.
!cp /content/merged_bid_ask_*_filtrado.csv /content/datos_procesados_merged/

In [None]:
# Create a zip archive of the processed-data folder (-r: include its contents
# recursively). Entries are stored with the content/datos_procesados_merged/
# path prefix, as shown in the command output.
!zip -r /content/datos_procesados_merged_v1.zip /content/datos_procesados_merged

  adding: content/datos_procesados_merged/ (stored 0%)
  adding: content/datos_procesados_merged/merged_bid_ask_15_min_filtrado.csv (deflated 73%)
  adding: content/datos_procesados_merged/merged_bid_ask_1_dia_filtrado.csv (deflated 64%)
  adding: content/datos_procesados_merged/merged_bid_ask_1_hora_filtrado.csv (deflated 70%)
