In [None]:
import logging
import os

import pandas as pd
import yaml
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [20]:
# Directory that receives the ETL log files, relative to the working directory of the notebook
# NOTE(review): assumes the notebook is run from a folder directly below the project root — confirm
etl_log_dir = os.path.join('..', 'logs', 'etl')
# Make sure the directory exists (including missing parents), otherwise the log file cannot be opened
os.makedirs(etl_log_dir, exist_ok=True)

# Set up the logging configuration, for the ETL extract process
logging.basicConfig(
    # Set the logging level to INFO, only messages with a level of INFO or higher will be recorded
    level=logging.INFO,
    # Set the format of the log messages, the format will be: 'timestamp - logger name - log level - log message'
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    # Set the filename of the log file
    filename=os.path.join(etl_log_dir, 'extract.log'),
    # 'w' mode will overwrite the log file every time the cell is run, 'a' mode would append to the log file
    filemode='w',
    # basicConfig does nothing when the root logger already has handlers; force=True (Python 3.8+)
    # removes the existing handlers first, so this configuration is always the one in effect
    force=True
)

def extract_data(source_file_path, config_file_path):
    """
    Extracts the data from a csv source file and checks that it contains the required columns
    listed in a YAML configuration file. Every failure is logged before the exception is raised.

    The required columns are read from the configuration under columns -> required_columns_to_load.

    @param source_file_path (str | os.PathLike): The path to the csv file to extract, the '.csv' extension is matched case-insensitively
    @param config_file_path (str | os.PathLike): The path to the configuration file, which contains the required columns
    @return pd.DataFrame: The data extracted from the file in a dataframe
    @raises ValueError: If the source file is not a csv file, or if required columns are missing from the data
    @raises FileNotFoundError: If the source file or the configuration file does not exist
    @raises KeyError, TypeError: If the configuration has no columns -> required_columns_to_load entry
    """
    # Log a message that the extract process has started
    logging.info(f"Starting data extract process for: {source_file_path}")

    # Work with the string form of the path, so pathlib.Path objects are accepted as well
    source_path = str(source_file_path)

    # Guard clause: only csv files are supported, the extension check ignores upper/lower case
    if not source_path.lower().endswith('.csv'):
        logging.error(f"Invalid file type for file: {source_file_path}. Expected a csv file.")
        raise ValueError("Invalid file type. Expected a csv file.")

    # Load the data into a pandas DataFrame
    try:
        data = pd.read_csv(source_path)
    except FileNotFoundError as error:
        logging.error(f"File not found: {source_file_path}")
        # Chain the original exception, so the underlying cause stays visible in the traceback
        raise FileNotFoundError(f"File not found: {source_file_path}") from error
    logging.info(f"Data succesfully extracted from: {source_file_path}")

    # Load the YAML configuration, which contains the required columns
    try:
        with open(config_file_path, 'r') as file:
            etl_config = yaml.safe_load(file)
    except FileNotFoundError as error:
        logging.error(f"Configuration file not found: {config_file_path}")
        raise FileNotFoundError(f"Configuration file not found: {config_file_path}") from error

    # Access the required columns; a missing entry is logged and re-raised unchanged,
    # so callers still receive the same exception type as before
    try:
        required_columns = etl_config['columns']['required_columns_to_load']
    except (KeyError, TypeError):
        logging.error(f"Configuration file has no 'columns' -> 'required_columns_to_load' entry: {config_file_path}")
        raise
    logging.info('Required columns succesfully extracted from configuration file.')

    # Validate that the data contains all the required columns
    missing_columns = [col for col in required_columns if col not in data.columns]
    if missing_columns:
        logging.error(f"The file is missing required columns: {missing_columns}")
        raise ValueError(f"Schema validation failed. Missing columns: {missing_columns}")

    # Log a successful schema validation
    logging.info("Schema validation passed. All required columns are present.")

    # Return the data
    return data

In [21]:
# Location of the raw Telco customer churn csv file
raw_data_path = '/Users/akram/DataScienceProjects/customer-churn-prediction/data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv'
# Location of the YAML configuration that lists the required columns
columns_config_path = '/Users/akram/DataScienceProjects/customer-churn-prediction/configuration/etl/data_columns.yaml'

# Extract the raw data and validate it against the required columns
data = extract_data(raw_data_path, columns_config_path)

In [13]:
# List the distinct categories that occur in the 'InternetService' column
data['InternetService'].unique()

array(['DSL', 'Fiber optic', 'No'], dtype=object)

In [14]:
# Lift the display limits for both rows and columns, so pandas never truncates a displayed dataframe
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# Preview the first seven rows of the extracted data
data.head(7)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes
5,9305-CDSKC,Female,0,No,No,8,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes
6,1452-KIOVK,Male,0,No,Yes,22,Yes,Yes,Fiber optic,No,Yes,No,No,Yes,No,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,No


In [16]:
# STD is a measure of the amount of variation or dispersion of a set of values. A low standard deviation means that most of the numbers are close to the average. 
# A high standard deviation means that the numbers are spread out.
#
# 25% of the data is below the first quartile, 50% is below the median, and 75% is below the third quartile
#
# Summary statistics of the extracted data. The result below only lists 'SeniorCitizen', 'tenure' and
# 'MonthlyCharges'; 'TotalCharges' is not part of it at this stage.
# NOTE(review): presumably because 'TotalCharges' is not loaded as a numeric column — confirm with data.dtypes

data.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [201]:
# Normalize the columns 'tenure', 'MonthlyCharges', and 'TotalCharges'
# Normalization is chosen over standardization because the data's difference is meaningful
cols_to_normalize = ['tenure', 'MonthlyCharges', 'TotalCharges']

# 'TotalCharges' is missing from the data.describe() summary, so it is not numeric yet at this point.
# Coerce the columns to numeric first: values that cannot be parsed become NaN and are filled with 0,
# which is the same treatment the transform function applies. Without this step the scaler cannot fit.
numeric_values = data[cols_to_normalize].apply(pd.to_numeric, errors='coerce').fillna(0)

# Scale every column with the MinMaxScaler and write the result back into the dataframe
scaler = MinMaxScaler()
data[cols_to_normalize] = scaler.fit_transform(numeric_values)

In [96]:
# Set up the logging configuration for ETL transform process
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    # TODO: Set to relative path
    filename='/Users/akram/DataScienceProjects/customer-churn-prediction/logs/etl/transform.log',
    filemode='w'
)

def normalize(data, columns_to_normalize):
    """
    Normalizes the columns in the data, using the MinMaxScaler
    Normalization is chosen over standardization because the data's difference is meaningful 
    and the data is not normally distributed. 

    The columns are overwritten in the dataframe that is passed in, and that same dataframe is returned.
    When there are no columns to normalize (None or an empty list), the data is returned untouched.

    @param data (pd.DataFrame): The data to normalize
    @param columns_to_normalize (list): The columns to normalize, may be None or empty
    @return pd.DataFrame: The data with the normalized columns
    """
    # Log a message that the normalization process has started
    logging.info("Starting normalization process")

    # Nothing to scale, for example when the entry in the configuration file is left empty.
    # The scaler cannot be fitted on zero columns, so return the data as it is.
    if columns_to_normalize is None or len(columns_to_normalize) == 0:
        logging.info("No columns to normalize, skipping normalization")
        return data

    # Initialize the MinMaxScaler
    scaler = MinMaxScaler()
    # Normalize the columns, the scaled values replace the original values in the dataframe
    data[columns_to_normalize] = scaler.fit_transform(data[columns_to_normalize])

    # Log a message that the normalization process has ended
    logging.info("Normalization process completed")

    # Return the data with the normalized columns
    return data

def label_encode(data, columns_to_label_encode):
    """
    Label encodes the columns in the data, using the LabelEncoder.
    Label encoding is used for columns that are ordinal, meaning the data has a meaningful order.

    The encoder is fitted again for every column, so each column gets its own set of integer codes.
    The columns are overwritten in the dataframe that is passed in, and that same dataframe is returned.
    When there are no columns to encode (None or an empty list), the data is returned untouched.

    @param data (pd.DataFrame): The data to label encode
    @param columns_to_label_encode (list): The columns to label encode, may be None or empty
    @return pd.DataFrame: The data with the label encoded columns
    """
    # Log a message that the label encoding process has started
    logging.info("Starting label encoding process")

    # Nothing to encode, for example when the entry in the configuration file is left empty.
    # Selecting columns with None would fail, so return the data as it is.
    if columns_to_label_encode is None or len(columns_to_label_encode) == 0:
        logging.info("No columns to label encode, skipping label encoding")
        return data

    # Initialize the LabelEncoder object
    label_encoder = LabelEncoder()
    # Label encode the columns, apply calls fit_transform once per column
    data[columns_to_label_encode] = data[columns_to_label_encode].apply(label_encoder.fit_transform)

    # Log a message that the label encoding process has ended
    logging.info("Label encoding process completed")

    # Return the data with the label encoded columns
    return data

def one_hot_encode(data, columns_to_one_hot_encode):
    """
    Replaces every listed column by one integer indicator column per category (pandas get_dummies).
    This encoding is meant for nominal columns, where the categories have no meaningful order.

    @param data (pd.DataFrame): The data that holds the columns to encode
    @param columns_to_one_hot_encode (list): The names of the columns to replace by indicator columns
    @return pd.DataFrame: A dataframe in which the listed columns are one hot encoded
    """
    # Mark the beginning of the encoding step in the log
    logging.info("Starting one hot encoding process")

    # Build the encoded dataframe, the indicator columns hold the integers 0 and 1
    encoded_data = pd.get_dummies(
        data,
        columns=columns_to_one_hot_encode,
        dtype=int,
    )

    # Mark the end of the encoding step in the log
    logging.info("One hot encoding process completed")

    return encoded_data

def transform(data, config_file):
    """
    Transforms the data by converting columns to numeric, normalizing, label encoding, and one hot
    encoding the columns in the data. The columns for every step are loaded from the configuration file.

    The transformation is applied to a copy, the dataframe that is passed in is left unchanged.

    @param data (pd.DataFrame): The data to transform
    @param config_file (str): The path to the configuration file
    @return pd.DataFrame: The transformed data
    @raises FileNotFoundError: If the configuration file does not exist
    @raises KeyError: If an expected entry under 'columns' is missing from the configuration file
    """
    # Log a message that the transform process has started
    logging.info("Starting data transform process")

    # Load the YAML configuration, which contains the columns. The parsed content gets its own name,
    # so config_file keeps referring to the path in the messages below
    try:
        with open(config_file, 'r') as file:
            config = yaml.safe_load(file)
    except FileNotFoundError as error:
        logging.error(f"Configuration file not found: {config_file}")
        # Chain the original exception, so the underlying cause stays visible in the traceback
        raise FileNotFoundError(f"Configuration file not found: {config_file}") from error
    # Log a message that the configuration file was loaded successfully
    logging.info("Configuration file loaded successfully")

    # Access the columns from the configuration file
    column_config = config['columns']
    columns_to_change_from_object_to_numeric = column_config['columns_to_change_from_object_to_numeric']
    columns_one_hot_encode = column_config['columns_one_hot_encode']
    columns_label_encode = column_config['columns_label_encode']
    columns_to_normalize = column_config['columns_to_normalize']
    # Columns that depend on another column, these columns will be label encoded
    columns_that_depend_on_another_column = column_config['columns_that_depend_on_another_column']

    # Log a message that the columns from the configuration file were loaded successfully
    logging.info("Columns from configuration file loaded successfully")

    # Work on a copy: the steps below assign into the dataframe, and the caller's data must not end up
    # partly transformed as a side effect
    data = data.copy()

    # Change the columns from object to numeric, values that cannot be parsed become NaN.
    # Fill the missing values with zero, since empty means no information about the customer,
    # which means no charges has been made
    data[columns_to_change_from_object_to_numeric] = (
        data[columns_to_change_from_object_to_numeric]
        .apply(pd.to_numeric, errors='coerce')
        .fillna(0)
    )

    # Normalize the columns
    data = normalize(data, columns_to_normalize)

    # Label encode independent columns
    data = label_encode(data, columns_label_encode)

    # One hot encode the columns
    data = one_hot_encode(data, columns_one_hot_encode)

    # Label encode columns that depend on another column
    data = label_encode(data, columns_that_depend_on_another_column)

    # Log a message that the transform process has ended
    logging.info("Data transform process completed")

    # Return the transformed data
    return data

In [90]:
# Location of the YAML configuration that lists the columns for every transform step
transform_config_path = '/Users/akram/DataScienceProjects/customer-churn-prediction/configuration/etl/data_columns.yaml'

# Run the transform process on the extracted data
trans_data = transform(data, transform_config_path)

In [92]:
# Distinct values of the 'TotalCharges' column after the transform process.
# Read them from the transformed dataframe, so the check does not rely on 'data' having been changed
# as a side effect of the transform call
trans_data['TotalCharges'].unique()

array([0.0012751 , 0.21586661, 0.01031041, ..., 0.03780868, 0.03321025,
       0.78764136])

In [93]:
# Distinct integer codes of the 'OnlineBackup' column in the transformed data
trans_data['OnlineBackup'].unique()

array([2, 0, 1])

In [94]:
# Preview the first seven rows of the transformed data
trans_data.head(7)

Unnamed: 0,customerID,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,InternetService_DSL,InternetService_Fiber optic,InternetService_No,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,7590-VHVEG,0,1,0,0.013889,0,1,0,2,0,0,0,0,0,1,0.115423,0.001275,0,1,0,1,0,0,0,0,1,0
1,5575-GNVDE,0,0,0,0.472222,1,0,2,0,2,0,0,0,1,0,0.385075,0.215867,0,0,1,1,0,0,0,0,0,1
2,3668-QPYBK,0,0,0,0.027778,1,0,2,2,0,0,0,0,0,1,0.354229,0.01031,1,0,1,1,0,0,0,0,0,1
3,7795-CFOCW,0,0,0,0.625,0,1,2,0,2,2,0,0,1,0,0.239303,0.210241,0,0,1,1,0,0,1,0,0,0
4,9237-HQITU,0,0,0,0.027778,1,0,0,0,0,0,0,0,0,1,0.521891,0.01533,1,1,0,0,1,0,0,0,1,0
5,9305-CDSKC,0,0,0,0.111111,1,2,0,0,2,0,2,2,0,1,0.80995,0.092511,1,1,0,0,1,0,0,0,1,0
6,1452-KIOVK,0,0,1,0.305556,1,2,0,2,0,0,2,0,0,1,0.704975,0.222779,0,0,1,0,1,0,0,1,0,0


In [95]:
# Summary statistics of the numeric columns in the transformed data
trans_data.describe()

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,InternetService_DSL,InternetService_Fiber optic,InternetService_No,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
count,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0
mean,0.162147,0.483033,0.299588,0.449599,0.903166,0.940508,0.790004,0.906432,0.904444,0.797104,0.985376,0.992475,0.690473,0.592219,0.462803,0.261149,0.26537,0.495244,0.504756,0.343746,0.439585,0.216669,0.219225,0.216101,0.335794,0.22888
std,0.368612,0.499748,0.45811,0.341104,0.295752,0.948554,0.859848,0.880162,0.879949,0.861551,0.885002,0.885091,0.833755,0.491457,0.299403,0.261397,0.441561,0.500013,0.500013,0.474991,0.496372,0.412004,0.413751,0.411613,0.472301,0.420141
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.125,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.171642,0.044245,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.402778,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.518408,0.15909,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,1.0,0.763889,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,0.712438,0.43478,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
