In [7]:
import pandas as pd
import logging
import yaml
import os

In [None]:
# TODO: Split up into separate functions

# TODO: Uncomment when the function is transferred to the source file
# # Make sure the directory 'etl' exists in the logs directory. If the directory does not exist, create it
# if not os.path.exists('../logs/etl'):
#     os.makedirs('../logs/etl')

# Configure the root logger once for the ETL extract step; every logging.* call below writes to this file
logging.basicConfig(
    # TODO: Set to relative path
    # Destination file for all log records
    filename='/Users/akram/DataScienceProjects/customer-churn-prediction/logs/etl/extract.log',
    # 'w' truncates the log file on every run; switch to 'a' to keep the history of earlier runs
    filemode='w',
    # Record layout: 'timestamp - logger name - log level - log message'
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    # Records below INFO (i.e. DEBUG) are discarded
    level=logging.INFO,
)

def extract_data(source_file_path, config_file_path):
    """
    Extracts the data from a csv source file and checks that it contains the required columns
    listed in a YAML configuration file. Every step and every failure is written to the log.

    The csv extension check is case-insensitive ('.csv' and '.CSV' are both accepted) and both
    paths may be given as a string or as a path-like object.

    @param
    - source_file_path (str | os.PathLike): The path to the csv file to extract
    - config_file_path (str | os.PathLike): The path to the YAML configuration file. The required columns
      are read from the key 'columns' -> 'required_columns_to_load'

    @return pd.DataFrame: The data extracted from the file in a dataframe

    @exceptions:
    - ValueError: If the file is not a csv file
    - FileNotFoundError: If the file is not found
    - FileNotFoundError: If the configuration file is not found
    - yaml.YAMLError: If the configuration file is not valid YAML
    - KeyError / TypeError: If the configuration file has no list under 'columns' -> 'required_columns_to_load'
    - ValueError: If the data is missing required columns
    """
    # Log a message that the extract process has started
    logging.info(f"Starting data extract process for: {source_file_path}")

    # The source is read before the configuration so that a bad source file is always reported first
    data = _read_source_csv(source_file_path)
    required_columns = _load_required_columns(config_file_path)
    _validate_required_columns(data, required_columns)

    return data


def _read_source_csv(source_file_path):
    """
    Reads the csv file at source_file_path into a dataframe.

    @exceptions:
    - ValueError: If the file does not have a csv extension
    - FileNotFoundError: If the file is not found
    """
    # str() lets path-like objects through, lower() makes the extension check case-insensitive
    if not str(source_file_path).lower().endswith('.csv'):
        logging.error(f"Invalid file type for file: {source_file_path}. Expected a csv file.")
        raise ValueError(f"Invalid file type for file: {source_file_path}. Expected a csv file.")

    try:
        data = pd.read_csv(source_file_path)
    except FileNotFoundError as error:
        logging.error(f"File not found: {source_file_path}")
        # Re-raise with a uniform message, keeping the original error as the explicit cause
        raise FileNotFoundError(f"File not found: {source_file_path}") from error

    logging.info(f"Data succesfully extracted from: {source_file_path}")
    return data


def _load_required_columns(config_file_path):
    """
    Loads the list of required columns from the YAML configuration file.

    @exceptions:
    - FileNotFoundError: If the configuration file is not found
    - yaml.YAMLError: If the configuration file is not valid YAML
    - KeyError / TypeError: If 'columns' -> 'required_columns_to_load' is missing or not iterable
    """
    try:
        with open(config_file_path, 'r') as file:
            # safe_load only builds plain Python objects, it never executes code from the file
            configuration = yaml.safe_load(file)
    except FileNotFoundError as error:
        logging.error(f"Configuration file not found: {config_file_path}")
        raise FileNotFoundError(f"Configuration file not found: {config_file_path}") from error
    except yaml.YAMLError:
        # Make sure a broken configuration file shows up in the log before the error propagates
        logging.error(f"Configuration file is not valid YAML: {config_file_path}")
        raise

    try:
        # list() surfaces an empty (None) entry here, where it can be logged, instead of later
        # during the schema validation
        required_columns = list(configuration['columns']['required_columns_to_load'])
    except (KeyError, TypeError):
        logging.error(
            f"Configuration file {config_file_path} does not define a list under "
            "'columns' -> 'required_columns_to_load'"
        )
        raise

    logging.info('Required columns succesfully extracted from configuration file.')
    return required_columns


def _validate_required_columns(data, required_columns):
    """
    Checks that every required column is present in the dataframe.

    @exceptions:
    - ValueError: If one or more required columns are missing
    """
    # Keep the order of the configuration file so the error message is easy to compare with it
    missing_columns = [col for col in required_columns if col not in data.columns]
    if missing_columns:
        logging.error(f"The file is missing required columns: {missing_columns}")
        raise ValueError(f"Schema validation failed. Missing columns: {missing_columns}")

    logging.info("Schema validation passed. All required columns are present.")

In [13]:
# Run the extract step on the raw Telco churn dataset, validated against the required-columns configuration
data = extract_data(
    source_file_path='/Users/akram/DataScienceProjects/customer-churn-prediction/data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv',
    config_file_path='/Users/akram/DataScienceProjects/customer-churn-prediction/configuration/etl/extract/required_columns.yaml',
)