In [1]:
# Importer les modules nécessaires
import os
import sys
import tempfile

import pandas as pd
from google.cloud import storage

# Single source of truth for the project checkout; the config directory is
# derived from it so the two locations can never drift apart.
PROJECT_ROOT = "/root/Trustia/Cicada-binance"
CONFIG_BASE_PATH = os.path.join(PROJECT_ROOT, "config")

# Make the project's `src` package importable. Guarded so that re-running this
# cell does not keep appending duplicate entries to sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# EnvController lives inside the project tree, so this import can only succeed
# after the sys.path update above.
from src.commons.env_manager.env_controller import EnvController

# Initialise the EnvController so it loads the configuration files
env_controller = EnvController(config_base_path=CONFIG_BASE_PATH)

# Read the bucket name from the YAML configuration
# NOTE(review): the recorded run of this cell loaded the *development* config
# but printed a bucket named "production-..." — confirm that is intended.
bucket_name = env_controller.get_yaml_config('Ta-lib', 'bucket')
print(f"Nom du bucket: {bucket_name}")

# Create a Google Cloud Storage client and get a handle on the bucket
client = storage.Client()
bucket = client.bucket(bucket_name)


Loading shared env from: /root/Trustia/Cicada-binance/config/shared/.env.shared
Loading environment-specific env from: /root/Trustia/Cicada-binance/config/development/secrets.env
Loading shared YAML config from: /root/Trustia/Cicada-binance/config/shared/db-config.yaml
Loading environment-specific YAML config from: /root/Trustia/Cicada-binance/config/development/app-config.yaml
Loading YAML file: /root/Trustia/Cicada-binance/config/development/app-config.yaml
Nom du bucket: production-trustia-raw-data


In [2]:
def load_gcs_file_to_dataframe(bucket_name, file_path, file_format='parquet'):
    """
    Download a file from Google Cloud Storage and load it into a DataFrame.

    The object is downloaded into a private temporary directory that is removed
    as soon as the data has been read, so successive or concurrent calls never
    overwrite each other's download and nothing is left behind on disk.

    :param bucket_name: Name of the GCS bucket.
    :param file_path: Path of the file inside the bucket.
    :param file_format: Format of the file; 'parquet' and 'csv' are supported.
    :return: DataFrame holding the file's data, or None if the format is
             unsupported or the download/load fails (the error is printed).
    """
    try:
        # Reject unsupported formats before paying for a download
        readers = {'parquet': pd.read_parquet, 'csv': pd.read_csv}
        reader = readers.get(file_format)
        if reader is None:
            raise ValueError(f"Format de fichier non supporté : {file_format}")

        # Create a GCS client and resolve the blob (file) inside the bucket
        client = storage.Client()
        blob = client.bucket(bucket_name).blob(file_path)

        # A fresh directory per call gives a unique download location and
        # guarantees cleanup even if reading the file raises.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_file_path = os.path.join(temp_dir, 'temp_file.' + file_format)

            # Download the file from GCS
            blob.download_to_filename(temp_file_path)
            print(f"Fichier téléchargé depuis GCS : {temp_file_path}")

            # The reader loads the data into memory before the directory is removed
            return reader(temp_file_path)

    except Exception as e:
        # Deliberate best-effort: callers in this notebook test `df is not None`
        print(f"Erreur lors du téléchargement ou du chargement du fichier : {e}")
        return None


In [None]:
def read_parquet_with_pyarrow(self, file_path):
    """
    Read a parquet file via ``pq.read_table`` and relabel its columns.

    The resulting DataFrame's columns are overwritten with a fixed list of
    twelve names, so the file is expected to contain exactly twelve columns.

    :param file_path: Path handed unchanged to ``pq.read_table``.
    :return: The relabelled DataFrame, or an empty DataFrame if any step
             raises (the error is reported via ``self.logger.log_error``).
    """
    # NOTE(review): `pq` is not imported in any visible cell — presumably
    # `pyarrow.parquet`; confirm before running this cell.
    column_names = [
        'open_time', 'open', 'high', 'low', 'close', 'volume',
        'close_time', 'quote_volume', 'count', 'taker_buy_volume',
        'taker_buy_quote_volume', 'ignore'
    ]
    try:
        frame = pq.read_table(file_path).to_pandas()
        frame.columns = column_names
    except Exception as err:
        self.logger.log_error(f"Error reading parquet file with pyarrow: {err}")
        return pd.DataFrame()
    return frame

# OUTPUT

In [11]:
# Object path inside the bucket (the bucket name itself is not part of it)
gcs_file_path = "Transformed/cryptos/BTCUSDT/spot/bars/time-bars/1h/data_2020_01_01_2024_10_21.parquet"  # Replace with the correct path of the file

# Download the object from GCS and load it into a DataFrame
df = load_gcs_file_to_dataframe(bucket_name, gcs_file_path, file_format='parquet')

# The loader returns None on failure: report that, otherwise show the frame
if df is None:
    print("Le DataFrame n'a pas pu être chargé.")
else:
    print("DataFrame chargé avec succès :")
    display(df)


Fichier téléchargé depuis GCS : /tmp/temp_file.parquet
DataFrame chargé avec succès :


Unnamed: 0,timestamp,open,high,low,close,volume,EMA_20,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,...,DMP_14,DMN_14,ATRr_14,CCI_20_0.015,ROC_12,MFI_14,TRIX_15_9,TRIXs_15_9,OBV,WILLR_14
0,2020-01-01 01:00:00,7176.47,7230.00,7175.71,7216.27,883.052603,,,,,...,,,,,,,,,883.052603,
1,2020-01-01 02:00:00,7215.52,7244.87,7211.41,7242.85,655.156809,,,,,...,,,,,,,,,1538.209412,
2,2020-01-01 03:00:00,7242.66,7245.00,7220.00,7225.01,783.724867,,,,,...,,,,,,,,,754.484545,
3,2020-01-01 04:00:00,7225.00,7230.00,7215.03,7217.27,467.812578,,,,,...,,,,,,,,,286.671967,
4,2020-01-01 05:00:00,7217.26,7229.76,7216.65,7224.21,344.670596,,,,,...,,,,,,,,,631.342563,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8039,2024-10-21 19:00:00,67335.05,67941.17,67335.01,67763.00,1240.189220,67783.277242,-65.231925,-111.711203,46.479278,...,23.944267,27.189111,477.751510,-51.467134,-1.096126,23.133222,0.014392,0.042910,94759.539398,-60.000260
8040,2024-10-21 20:00:00,67763.99,67818.19,67624.18,67749.99,494.387090,67780.107028,-57.468619,-83.158317,25.689699,...,23.218960,26.365513,457.484259,-49.342875,-0.807406,25.410660,0.009287,0.036779,94265.152308,-53.589074
8041,2024-10-21 21:00:00,67750.00,67933.79,67688.14,67803.21,389.907190,67782.307311,-46.485872,-57.740457,11.254584,...,24.164595,25.319696,442.353241,-34.104910,-0.840607,27.555608,0.005376,0.030294,94655.059498,-47.122774
8042,2024-10-21 22:00:00,67803.21,67878.78,67602.84,67781.07,544.894630,67782.189472,-39.117557,-40.297713,1.180156,...,23.058158,25.575778,430.466581,-35.189374,-0.732878,28.146528,0.002358,0.023941,94110.164868,-42.735355


In [10]:
# Object path inside the bucket (the bucket name itself is not part of it)
gcs_file_path = "Transformed/cryptos/BTCUSDT/spot/bars/time-bars/30m/data_2023_01_01_2024_10_21.parquet"  # Replace with the correct path of the file

# Download the object from GCS and load it into a DataFrame
df = load_gcs_file_to_dataframe(bucket_name, gcs_file_path, file_format='parquet')

# The loader returns None on failure: report that, otherwise show the frame
if df is None:
    print("Le DataFrame n'a pas pu être chargé.")
else:
    print("DataFrame chargé avec succès :")
    display(df)


Fichier téléchargé depuis GCS : /tmp/temp_file.parquet
DataFrame chargé avec succès :


Unnamed: 0,timestamp,open,high,low,close,volume,EMA_20,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,...,DMP_14,DMN_14,ATRr_14,CCI_20_0.015,ROC_12,MFI_14,TRIX_15_9,TRIXs_15_9,OBV,WILLR_14
0,2023-01-01 00:30:00,16544.19,16544.61,16508.39,16529.67,2089.66847,,,,,...,,,,,,,,,2089.668470,
1,2023-01-01 01:00:00,16529.59,16541.80,16525.78,16534.60,1638.93201,,,,,...,,,,,,,,,3728.600480,
2,2023-01-01 01:30:00,16534.02,16556.80,16530.62,16551.47,1951.13468,,,,,...,,,,,,,,,5679.735160,
3,2023-01-01 02:00:00,16551.47,16555.00,16539.57,16552.89,1707.51457,,,,,...,,,,,,,,,7387.249730,
4,2023-01-01 02:30:00,16552.89,16559.77,16538.14,16548.19,1611.32581,,,,,...,,,,,,,,,5775.923920,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31013,2024-10-21 21:30:00,67861.35,67861.35,67688.14,67803.21,156.75273,67711.666639,-138.954027,98.724404,-237.678431,...,21.335092,23.613976,296.417441,57.340433,0.686342,61.930315,-0.056124,-0.069675,467192.797448,-12.536120
31014,2024-10-21 22:00:00,67803.21,67878.78,67700.00,67770.01,262.89783,67717.223150,-119.096630,94.865440,-213.962070,...,20.821407,22.566979,288.014767,68.886325,1.170394,70.844404,-0.051052,-0.067146,466929.899618,-15.678444
31015,2024-10-21 22:30:00,67770.00,67805.91,67602.84,67781.07,281.99680,67723.303802,-101.299335,90.130189,-191.429523,...,19.750233,23.867456,281.947283,67.243194,0.674425,62.062531,-0.046066,-0.063884,467211.896418,-15.288826
31016,2024-10-21 23:00:00,67781.07,67781.08,67429.86,67513.77,382.65678,67703.348202,-107.524253,67.124216,-174.648469,...,18.023203,26.087099,286.895334,30.673589,0.050030,65.971745,-0.042029,-0.060089,466829.239638,-40.814767


In [5]:
# Show the column labels of whichever DataFrame is currently bound to `df`
# (the name is reassigned by each load cell, so the result depends on which one ran last).
df.columns

Index(['timestamp', 'open', 'high', 'low', 'close', 'volume', 'EMA_20',
       'MACD_12_26_9', 'MACDh_12_26_9', 'MACDs_12_26_9', 'SMA_50', 'WMA_30',
       'STOCHk_14_3_3', 'STOCHd_14_3_3', 'ADX_14', 'DMP_14', 'DMN_14',
       'ATRr_14', 'CCI_20_0.015', 'ROC_12', 'MFI_14', 'TRIX_15_9',
       'TRIXs_15_9', 'OBV', 'WILLR_14'],
      dtype='object')

# INPUT

In [6]:
# Object path inside the bucket (the bucket name itself is not part of it)
gcs_file_path = "Raw/binance-data-vision/historical/BTCUSDT/futures/klines/15m/2024/10/21/data.parquet"  # Replace with the correct path of the file

# Download the object from GCS and load it into a DataFrame
df = load_gcs_file_to_dataframe(bucket_name, gcs_file_path, file_format='parquet')

# The loader returns None on failure: report that, otherwise show the frame
if df is None:
    print("Le DataFrame n'a pas pu être chargé.")
else:
    print("DataFrame chargé avec succès :")
    display(df)

Fichier téléchargé depuis GCS : /tmp/temp_file.parquet
DataFrame chargé avec succès :


Unnamed: 0,open_time,open,high,low,close,volume,close_time,quote_volume,count,taker_buy_volume,taker_buy_quote_volume,ignore
0,1729468800000,69020.2,69125.0,68957.2,69124.9,1453.355,1729469699999,1.003387e+08,26153,737.136,5.089169e+07,0
1,1729469700000,69124.9,69566.0,69124.9,69400.3,7388.546,1729470599999,5.127013e+08,85677,4163.723,2.889364e+08,0
2,1729470600000,69400.3,69555.0,69233.0,69254.7,2546.768,1729471499999,1.766399e+08,43848,1226.380,8.507234e+07,0
3,1729471500000,69254.7,69254.7,69055.0,69153.9,2292.736,1729472399999,1.585320e+08,31378,902.957,6.243258e+07,0
4,1729472400000,69153.9,69287.5,69091.2,69283.0,1537.888,1729473299999,1.063843e+08,24784,912.562,6.313607e+07,0
...,...,...,...,...,...,...,...,...,...,...,...,...
91,1729550700000,67674.5,67793.4,67674.4,67769.9,1320.284,1729551599999,8.944530e+07,19473,888.448,6.018838e+07,0
92,1729551600000,67769.9,67770.0,67408.7,67530.5,4214.968,1729552499999,2.847166e+08,42836,1705.069,1.151686e+08,0
93,1729552500000,67530.4,67574.1,67475.3,67489.9,1020.234,1729553399999,6.888789e+07,18723,567.303,3.830541e+07,0
94,1729553400000,67489.9,67521.1,67400.0,67513.8,1077.977,1729554299999,7.272442e+07,20574,583.887,3.939345e+07,0
