In [None]:
# Automatically install the necessary libraries
!pip install pandas selenium pyarrow google-cloud-storage google-cloud-bigquery



In [None]:
# Importing the necessary libraries
try:
    import pandas as pd
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.common.by import By
    import os
    import pyarrow as pa
    import pyarrow.parquet as pq
    import time
    from datetime import datetime
    from google.cloud import storage
    from google.cloud import bigquery
    from google.oauth2 import service_account

    print("Libraries imported correctly.")
except ImportError as e:
    print(f"Error importing libraries: {e}")

In [None]:
# Function to configure the WebDriver
def setup_driver():
    """
    Build a Chrome WebDriver that runs in headless mode.

    The browser is started with the headless, no-GPU and no-sandbox flags and
    with download preferences that suppress the "save as" prompt.

    Returns the driver on success. If anything goes wrong the error is printed
    and None is returned.
    """
    launch_flags = ("--headless", "--disable-gpu", "--no-sandbox")
    download_prefs = {
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": True,
    }
    try:
        opts = Options()
        for flag in launch_flags:
            opts.add_argument(flag)
        opts.add_experimental_option("prefs", download_prefs)

        # Start the browser with the options assembled above
        browser = webdriver.Chrome(options=opts)
        print("Driver configured in headless mode.")
        return browser
    except Exception as err:
        print(f"Error in driver configuration: {err}")
        return None

In [None]:
def _list_csv_files(directory):
    """Return the full paths of the regular ``.csv`` files directly inside ``directory``."""
    try:
        names = os.listdir(directory)
    except FileNotFoundError:
        # The folder does not exist (yet): nothing has been downloaded into it
        return []
    paths = [os.path.join(directory, name) for name in names]
    return [p for p in paths if p.lower().endswith(".csv") and os.path.isfile(p)]


def _wait_for_new_csv(directory, known_files, timeout=30, poll_interval=0.5):
    """Poll ``directory`` until a CSV not listed in ``known_files`` appears; return it or None."""
    attempts = max(1, int(timeout / poll_interval))
    for _ in range(attempts):
        fresh = [p for p in _list_csv_files(directory) if p not in known_files]
        if fresh:
            # Several new files are unlikely, but pick the most recent one if so
            return max(fresh, key=os.path.getctime)
        time.sleep(poll_interval)
    return None


def get_csv_data(driver, download_timeout=30):
    """
    Download the equities list as CSV from the Euronext Milan page.

    The function opens the page, accepts the cookie banner, opens the download
    dialog, selects the CSV format and starts the download. It then waits for a
    CSV file that was not present before the download to appear in the current
    user's ``Downloads`` folder.

    :param driver: A Selenium WebDriver instance (see ``setup_driver``).
    :param download_timeout: Maximum number of seconds to wait for the new CSV
        file to appear in the download folder.
    :return: Full path of the downloaded CSV file, or None if any step failed
        or no new CSV file showed up in time. Errors are printed, not raised.
    """
    try:
        # Visit the page
        driver.get("https://live.euronext.com/en/markets/milan/equities/list")
        time.sleep(5)  # Wait for the page to load

        # Accept the cookie banner
        try:
            cookie = driver.find_element(By.ID, "onetrust-accept-btn-handler")
            cookie.click()
            print("Cookie accepted.")
        except Exception as e:
            print(f"Error accepting cookies: {e}")
            return None

        # Find and click the button that opens the download dialog.
        # NOTE(review): the absolute XPaths below break whenever the page layout changes.
        try:
            download = driver.find_element(By.XPATH, "/html/body/div[2]/div[1]/div/div/div[1]/div[3]/div/main/section/div[3]/div/div/div[1]/div[2]/div[1]/div[2]/button")
            download.click()
            time.sleep(2)  # Wait for the download menu to open
            print("Download button clicked.")
        except Exception as e:
            print(f"Error clicking the download button: {e}")
            return None

        try:
            # Select CSV as the download format
            down_csv = driver.find_element(By.XPATH, "/html/body/div[7]/div/div/div[2]/fieldset[1]/div[2]/label")
            down_csv.click()
            print("CSV format selected.")
        except Exception as e:
            print(f"Error selecting the CSV format:: {e}")
            return None

        # The download folder is the same on Windows, macOS and Linux
        download_dir = os.path.join(os.path.expanduser("~"), "Downloads")

        # Remember which CSV files already exist so that an old or unrelated
        # file is never mistaken for the one downloaded by this run.
        known_files = set(_list_csv_files(download_dir))

        try:
            # Click to start the download
            down_file = driver.find_element(By.XPATH, "/html/body/div[7]/div/div/div[2]/input")
            down_file.click()
            print("Download started.")
        except Exception as e:
            print(f"Error starting the download: {e}")
            return None

        # Wait for the new file instead of sleeping a fixed amount of time
        try:
            latest_file = _wait_for_new_csv(download_dir, known_files, timeout=download_timeout)
        except Exception as e:
            print(f"Error searching for the downloaded file: {e}")
            return None

        if latest_file is None:
            print(f"Error searching for the downloaded file: no new CSV file appeared in {download_dir} within {download_timeout} seconds")
            return None

        print(f"Downloaded file found: {latest_file}")
        return latest_file

    except Exception as e:
        print(f"General error during file download: {e}")
        return None

In [None]:
def csv_to_df(latest_file):
    """
    Read the downloaded semicolon-separated file into a DataFrame.

    The first line is used as the header; the three lines that follow it
    (0-based line numbers 1, 2 and 3) are skipped.

    :param latest_file: Path of the file to read.
    :return: The loaded DataFrame, or None if reading failed (the error is printed).
    """
    try:
        frame = pd.read_csv(latest_file, sep=";", skiprows=[1, 2, 3])
        print(f"CSV file loaded successfully: {latest_file}")
    except Exception as err:
        print(f"Error loading the CSV file: {err}")
        return None
    return frame

def clean_numeric_columns(df):
    """
    Strip apostrophes from the price, volume and date columns.

    The DataFrame is modified in place and also returned.
    """
    # Columns whose values might contain apostrophes
    apostrophe_columns = (
        "Open Price",
        "High Price",
        "low Price",
        "last Price",
        "last Trade MIC Time",
        "Volume",
        "Turnover",
        "Closing Price",
        "Closing Price DateTime",
    )

    for name in apostrophe_columns:
        # Regex replacement removes every apostrophe inside each value
        df[name] = df[name].replace(to_replace="'", value="", regex=True)

    return df

def convert_numeric_col(df):
    """
    Convert the price, turnover and volume columns to numeric types.

    Values that cannot be parsed become NaN in the price/turnover columns;
    in "Volume" they become 0 and the column is cast to int.
    The DataFrame is modified in place and also returned.
    """
    for name in ("Open Price", "High Price", "low Price", "last Price", "Turnover", "Closing Price"):
        df[name] = pd.to_numeric(df[name], errors='coerce')

    # Volume must be an integer column, so missing values are replaced by 0 first
    volume = pd.to_numeric(df["Volume"], errors='coerce')
    df["Volume"] = volume.fillna(0).astype(int)
    return df

def convert_datetime_col(df):
    """
    Parse the two date/time columns and cast their values to datetime64[us].

    "last Trade MIC Time" is parsed as day/month/year hour:minute and
    "Closing Price DateTime" as day/month/year. Values that do not match the
    format become NaT. The DataFrame is modified in place and also returned.
    """
    column_formats = (
        ("last Trade MIC Time", '%d/%m/%Y %H:%M'),
        ('Closing Price DateTime', '%d/%m/%Y'),
    )
    for name, fmt in column_formats:
        parsed = pd.to_datetime(df[name], format=fmt, errors='coerce')
        # Cast the underlying array to microsecond resolution before writing it back
        df[name] = parsed.values.astype('datetime64[us]')
    return df

def fix_col(df):
    """
    Run the column clean-up steps on the DataFrame, in order:
    apostrophe removal, numeric conversion, date/time conversion.

    Each step modifies ``df`` in place. Returns ``df`` on success, or None if
    any step raised (the error is printed).
    """
    try:
        steps = (clean_numeric_columns, convert_numeric_col, convert_datetime_col)
        for step in steps:
            step(df)
        print("Columns Fixed")
        return df
    except Exception as err:
        print(f"Error during fixing col: {err}")
        return None
    
    
def add_scarping_detail_and_convert_df_to_parquet(df):
    """
    Adds scraping metadata columns to the DataFrame and saves it as a Parquet file.

    Two columns are added in place: ``scraping_timestamp`` (the current local
    time) and ``scraping_url`` (the page the data was scraped from). The
    DataFrame is then written to ``financial_data.parquet`` in the current
    user's ``Downloads`` folder.

    :param df: The DataFrame to enrich and save.
    :return: Full path of the written Parquet file, or None if writing failed
        (the error is printed).
    """
    try:
        df['scraping_timestamp'] = datetime.now()
        df['scraping_url'] = "https://live.euronext.com/en/markets/milan/equities/list"
        print("Metadata columns added.")
    except Exception as e:
        # Best effort: the file is still written below, without the metadata
        print(f"Error adding metadata columns: {e}")

    try:
        # The download folder is the same on Windows, macOS and Linux
        download_dir = os.path.join(os.path.expanduser("~"), "Downloads")
        os.makedirs(download_dir, exist_ok=True)

        # Convert the DataFrame to Parquet format
        table = pa.Table.from_pandas(df)
        parquet_file = os.path.join(download_dir, 'financial_data.parquet')
        pq.write_table(table, parquet_file)
        print(f"Parquet file saved as: {parquet_file}")
        return parquet_file
    except Exception as e:
        # Report the actual cause instead of hiding it behind a generic message
        print(f"Error during convert the file to parquet: {e}")
        return None




In [None]:
# Execute the function

# Configure and get the driver
driver = setup_driver()

# Run the function to download and get csv
latest_file = None
if driver is not None:
    try:
        latest_file = get_csv_data(driver)
    finally:
        # Always close the browser so no headless Chrome process is left running
        driver.quit()

# save df file to parquet with scraping metadata
# Each step returns None on failure, so stop at the first one that fails
# instead of passing None further down the pipeline.
df = csv_to_df(latest_file) if latest_file is not None else None
if df is not None:
    df = fix_col(df)
if df is not None:
    add_scarping_detail_and_convert_df_to_parquet(df)
else:
    print("Pipeline stopped: no data available to convert to Parquet.")

In [None]:
def upload_to_gcs(bucket_name, source_file_name, destination_blob_name, credential_path):
    """
    Upload a local file to a Google Cloud Storage bucket.

    Errors are printed rather than raised; the function always returns None.

    :param bucket_name: The name of the GCS bucket.
    :param source_file_name: The local path of the file to be uploaded.
    :param destination_blob_name: The object name to give the file in the bucket.
    :param credential_path: Path to the service-account JSON credentials file.
    """
    try:
        # Client authenticated with the given service-account file
        gcs_client = storage.Client.from_service_account_json(credential_path)

        # Resolve bucket -> blob, then push the local file
        target_blob = gcs_client.bucket(bucket_name).blob(destination_blob_name)
        target_blob.upload_from_filename(source_file_name)

        print(f"File {source_file_name} successfully uploaded as {destination_blob_name} to bucket {bucket_name}.")
    except Exception as err:
        print(f"Error uploading to GCS: {err}")


In [None]:
bucket_name = 'parquet-dataset-financial-data'  # GCS bucket name
# Local path to the Parquet file: the current user's Downloads folder, which is
# where the Parquet writer saves it, so the path is valid on any machine.
source_file_name = os.path.join(os.path.expanduser("~"), "Downloads", "financial_data.parquet")
destination_blob_name = 'main_financial_data.parquet'  # Desired name for the file in the GCS bucket
# Path to the JSON credentials file: taken from the standard Google environment
# variable when it is set, otherwise the original local path is used.
credentials_path = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    '/Users/danifila/Desktop/UpWork/dani-financial-1ca621e0a4c6.json',
)


upload_to_gcs(bucket_name,source_file_name,destination_blob_name,credentials_path)

In [None]:
def load_parquet_to_bigquery(dataset_id, table_id, gcs_uri, credentials_path):
    """
    Load a Parquet file stored in GCS into a BigQuery table, replacing its contents.

    Errors are printed rather than raised; the function always returns None.

    :param dataset_id: ID of the BigQuery dataset.
    :param table_id: ID of the BigQuery table.
    :param gcs_uri: URI of the Parquet file in GCS (e.g., gs://bucket-name/file-name.parquet).
    :param credentials_path: Path to the JSON credentials file.
    """
    try:
        bq_client = bigquery.Client.from_service_account_json(credentials_path)

        # Destination in "dataset.table" form
        destination = f"{dataset_id}.{table_id}"

        # Parquet input; WRITE_TRUNCATE overwrites the existing table
        overwrite_with_parquet = bigquery.LoadJobConfig(
            write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
            source_format=bigquery.SourceFormat.PARQUET,
        )

        job = bq_client.load_table_from_uri(gcs_uri, destination, job_config=overwrite_with_parquet)
        job.result()  # Block until the load job has finished

        print(f"Loaded {job.output_rows} rows into table {destination}.")
    except Exception as err:
        print(f"Error during loading to BigQuery: {err}")

In [None]:
dataset_id = 'financial_data'  # ID of the BigQuery dataset
table_id = 'stock_prices'  # ID of the BigQuery table
gcs_uri = 'gs://parquet-dataset-financial-data/main_financial_data.parquet'  # URI of the Parquet file in GCS
# Path to the JSON credentials file: taken from the standard Google environment
# variable when it is set, otherwise the original local path is used.
credentials_path = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    '/Users/danifila/Desktop/UpWork/dani-financial-1ca621e0a4c6.json',
)

load_parquet_to_bigquery(dataset_id, table_id, gcs_uri, credentials_path)