In [159]:
## Import packages
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [160]:
## Data Importing functions

def import_client(filepath : str = "Data/client.csv") -> pd.DataFrame:
    """
    Load the client dataset from a CSV file.

    Parameters:
        filepath (str): Location of the client CSV file.
            Defaults to "Data/client.csv".

    Returns:
        pd.DataFrame: The parsed contents of the CSV file.
    """
    client_df = pd.read_csv(filepath)
    return client_df

def import_invoice(filepath : str = "Data/invoice.csv") -> pd.DataFrame:
    """
    Load the invoice dataset from a CSV file.

    Parameters:
        filepath (str): Location of the invoice CSV file.
            Defaults to "Data/invoice.csv".

    Returns:
        pd.DataFrame: The parsed contents of the CSV file.
    """
    invoice_df = pd.read_csv(filepath)
    return invoice_df

In [161]:
## Data Preprocessing functions

def convert_date(df: pd.DataFrame) -> pd.DataFrame:
    """
    Converts date column to an integer representing 
    the number of days since the earliest date.
    *Column name 'date' is fixed for both datasets.

    The input frame is left untouched: a new DataFrame is returned with
    the 'date' column replaced. This keeps the function safe to re-run on
    the same raw frame (e.g. when a notebook cell is executed twice);
    overwriting the column in place would make a second call re-parse the
    day counts as timestamps.

    Parameters:
        df (pd.DataFrame): Frame containing a 'date' column parseable by
            pd.to_datetime.

    Returns:
        pd.DataFrame: Copy of df whose 'date' column holds whole days
            elapsed since the earliest date in that column.

    Raises:
        KeyError: If df has no 'date' column.
    """

    # Parse into a local Series instead of writing back into the caller's frame.
    # NOTE(review): the recorded notebook run shows a warning raised at this
    # to_datetime call; consider passing an explicit `format=` once the date
    # layout of the CSV files is confirmed.
    parsed_dates = pd.to_datetime(df['date'])

    # Offset of every row from the earliest date, in whole days
    days_since_first = (parsed_dates - parsed_dates.min()).dt.days

    # assign() returns a new frame and keeps 'date' in its original position
    return df.assign(date = days_since_first)

def drop_duplicates(df : pd.DataFrame) -> pd.DataFrame:
    """
    Reports whether the dataframe contains duplicated rows and,
    if it does, returns it with those rows removed.

    Parameters:
        df (pd.DataFrame): Frame to check.

    Returns:
        pd.DataFrame: The same object when no duplicates exist; otherwise
            a de-duplicated frame with a fresh 0..n-1 index.
    """
    # Nothing to clean: hand the frame back unchanged
    if not df.duplicated().any():
        print("No duplicates found!")
        return df

    print("Duplicates found! Cleaning them up...")
    # Keep the first occurrence of each row, then renumber the index
    return df.drop_duplicates().reset_index(drop = True)

def convert_to_categorical(
        df : pd.DataFrame,
        cols : list[str]
        ) -> pd.DataFrame:
    """
    Casts the listed columns of a dataframe to the 'category' dtype.

    The columns are overwritten on the dataframe that was passed in,
    and that same dataframe is returned.

    Parameters:
        df (pd.DataFrame): Frame whose columns should be cast.
        cols (list[str]): Names of the columns to cast.

    Returns:
        pd.DataFrame: df, with the listed columns stored as categoricals.
    """
    # Cast the selected columns first, then write them back onto df
    as_category = df[cols].astype('category')
    df[cols] = as_category
    return df


In [162]:
## Data Feature Engineering functions

def aggregate_invoice(df : pd.DataFrame) -> pd.DataFrame:
    """
    Collapse the invoice dataframe to one row per id.

    For each of consommation_level_1..4 and for date, the sum, mean, max
    and std within the id are computed; counter_statue is counted to give
    the number of invoices per id.

    Returns:
        pd.DataFrame: One row per id. Columns are 'id' followed by
            '<source column>_<statistic>' names, e.g.
            'consommation_level_1_sum' or 'counter_statue_count'.
    """
    summary_stats = ['sum', 'mean', 'max', 'std']
    measured_cols = [
        f'consommation_level_{level}' for level in range(1, 5)
    ] + ['date']

    # Same four statistics for every measured column...
    agg_spec = {name: list(summary_stats) for name in measured_cols}
    # ...plus a plain count that serves as the number of invoices
    agg_spec['counter_statue'] = 'count'

    # NOTE(review): pandas' std is NaN for a group with a single row;
    # confirm downstream steps cope with ids that have only one invoice.
    grouped = df.groupby('id').agg(agg_spec).reset_index()

    # The aggregation yields two-level column labels; flatten them.
    flat_names = []
    for top, sub in grouped.columns.values:
        # 'id' has an empty second level, so it keeps its original name
        flat_names.append(f'{top}_{sub}'.strip() if sub else top)
    grouped.columns = flat_names

    return grouped

def manual_fix_names(
        df : pd.DataFrame,
        new_col_names : list[str]
        ) -> pd.DataFrame:
    """
    Replaces every column label of a dataframe with the supplied names.

    The labels are overwritten on the dataframe that was passed in,
    and that same dataframe is returned.

    Parameters:
        df (pd.DataFrame): Frame whose columns should be relabelled.
        new_col_names (list[str]): New labels, given in column order.

    Returns:
        pd.DataFrame: df carrying the new column labels.
    """
    relabelled = df  # same object: the labels are replaced in place
    relabelled.columns = new_col_names
    return relabelled


In [163]:
## Data Joining functions

def merge(
        client_df : pd.DataFrame,
        invoice_df : pd.DataFrame,
        merge_by : str = "id"
        ) -> pd.DataFrame:
    """
    Joins the client and invoice dataframes on a shared key column.

    Parameters:
        client_df (pd.DataFrame): Left-hand frame.
        invoice_df (pd.DataFrame): Right-hand frame.
        merge_by (str): Name of the key column present in both frames.
            Defaults to 'id'.

    Returns:
        pd.DataFrame: Result of pd.merge on the key column, using
            pd.merge's default join type.
    """
    return pd.merge(left = client_df, right = invoice_df, on = merge_by)


In [164]:
## Dimensionality Reduction functions

def prep_dataframe(
        df : pd.DataFrame, 
        response_col_name : str,
        cat_col_names : list[str] = [],
        scale : bool = True,
        OHE : bool = True
        ) -> pd.DataFrame:
    """
    For scaling and one-hot encoding dataframe.

    The 'id' column (if present) is dropped, numerical columns are
    optionally standardised, categorical columns are optionally one-hot
    encoded, and the response column is appended unchanged as the last
    column. Neither df nor cat_col_names is modified.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        response_col_name (str): Name of response column
        cat_col_names (list[str]): List of categorical column names to evaluate.
            'id' and the response column are ignored if listed.
        scale (bool): Indicate if data should be scaled
        OHE (bool): Indicate if categorical variables should undergo One-Hot Encoding

    Returns:
        pd.DataFrame: A DataFrame with columns scaled/One-Hot Encoded.
            Column order is numerical columns, then categorical
            (or encoded) columns, then the response column.

    Raises:
        KeyError: If the response column or a listed categorical column
            is missing from df.
    """

    # Drop id column as it is not useful for predicting.
    # errors='ignore' lets the function also accept a frame without an id.
    df = df.drop(columns = ['id'], errors = 'ignore')

    # Build a filtered copy instead of calling .remove() on the argument:
    # .remove() would alter the caller's list (and the shared default list)
    # and raises ValueError when 'id' is not in it.
    # 'id' no longer exists in the frame; the response is never encoded.
    cat_col_names = [
        col for col in cat_col_names
        if col != 'id' and col != response_col_name
    ]

    # Identify numerical columns 
    # by excluding categorical and response columns
    num_col_names = [
        col for col in df.columns 
        if col not in cat_col_names 
        and col != response_col_name
    ]

    y = df[response_col_name].values # Response column
    X_num = df[num_col_names].values # Numerical columns

    if scale and num_col_names:
        # Scale numerical features (skipped when there are none to fit on)
        X_num = StandardScaler().fit_transform(X_num)
    num_part = pd.DataFrame(X_num, columns = num_col_names)

    if OHE and cat_col_names:
        # One-hot encode categorical features
        encoder = OneHotEncoder(sparse_output = False, drop = 'first')
        # drop = 'first' to avoid dummy variable trap
        encoded = encoder.fit_transform(df[cat_col_names].values)
        cat_part = pd.DataFrame(
            encoded,
            columns = list(encoder.get_feature_names_out(cat_col_names))
        )
    else:
        # No encoder was fitted, so keep the categorical columns as they are.
        # The index is reset so it lines up with num_part's fresh index.
        cat_part = df[cat_col_names].reset_index(drop = True)

    # Combine the numerical and categorical parts side by side
    prep_df = pd.concat([num_part, cat_part], axis = 1)

    # Append the response variable as the last column
    prep_df[response_col_name] = y

    return prep_df

def principal_component_analysis(
        df : pd.DataFrame, 
        response_col_name : str,
        cat_col_names : list[str] = [], 
        var : float = 0.95, 
        logs : bool = False
        ) -> pd.DataFrame:
    """
    For reducing the dimensions of a dataframe using PCA.

    Every column that is neither listed in cat_col_names nor the response
    column is treated as numerical and passed to PCA. The categorical
    columns and the response column are carried over unchanged.
    Neither df nor cat_col_names is modified.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        response_col_name (str): Name of response column
        cat_col_names (list[str]): List of categorical column names to evaluate.
            The response column is ignored if listed.
        var (float): Proportion of variance that should be preserved
        logs (bool): Indicate if logs should be printed

    Returns:
        pd.DataFrame: A DataFrame that has undergone PCA. Columns are
            'PC1'..'PCk', then the categorical columns, then the response.
    """

    # Build a filtered copy instead of calling .remove() on the argument,
    # which would alter the caller's list (or the shared default list).
    # The response column is never treated as a categorical feature.
    cat_col_names = [
        col for col in cat_col_names
        if col != response_col_name
    ]

    # Identify numerical columns 
    # by excluding categorical and response columns
    num_col_names = [
        col for col in df.columns 
        if col not in cat_col_names 
        and col != response_col_name
    ]

    y = df[response_col_name].values # Response column
    X_num = df[num_col_names].values # Numerical columns
    X_cat = df[cat_col_names].values # Categorical columns

    pca = PCA(n_components = var) 
    # Keep 'var' proportion of the variance : default 95%
    X_pca = pca.fit_transform(X_num) 

    # Name the components PC1, PC2, ... in the order PCA returns them
    pca_columns = [f'PC{i+1}' for i in range(X_pca.shape[1])]
    df_pca = pd.DataFrame(X_pca, columns = pca_columns)

    # All three parts are built from plain arrays, so they share the same
    # fresh 0..n-1 index and line up row by row.
    df_modified = pd.concat(
        [
            df_pca, 
            pd.DataFrame(X_cat, columns = cat_col_names), 
            pd.Series(y, name = response_col_name)
        ], 
        axis = 1)
    
    if logs:
        print(f"Number of components selected: {pca.n_components_}")
        print("Explained variance ratio for each component:", pca.explained_variance_ratio_)
        print("Cumulative explained variance:", pca.explained_variance_ratio_.cumsum())
        print("Final DataFrame with PCA applied to numeric columns:")
        print(df_modified.head())

    return df_modified

def filter_low_variance(
        df: pd.DataFrame, 
        num_col_names: list[str], 
        threshold: float = 0.1
        ) -> pd.DataFrame:
    """
    Select the numeric columns whose variance exceeds a threshold.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        num_col_names (list[str]): List of numeric column names to evaluate.
        threshold (float): Variance a column must exceed to be kept.

    Returns:
        pd.DataFrame: The columns from num_col_names whose variance is
            strictly greater than threshold, in their original order.
            Columns of df that are not listed in num_col_names are not
            part of the result.
    """
    # NOTE(review): columns outside num_col_names (e.g. categorical or
    # response columns) are not carried over - confirm this is intended.

    # Variance of each candidate column
    col_variances = df[num_col_names].var()

    # Names of the columns that clear the threshold : default 0.1
    kept_cols = [
        name for name, variance in col_variances.items()
        if variance > threshold
    ]

    return df[kept_cols]


In [165]:
## Define main workflow

def main(): 
    """
    Run the full workflow: load both datasets, clean them, aggregate the
    invoices per id, merge with the clients, scale/one-hot encode the
    result and finally apply PCA. Column names and the head of the final
    frame are printed along the way.
    """
    categorical_column_names = ['region', 'dis', 'id', 'catg', 'target']

    # Names given to the aggregated invoice columns, in aggregation order
    stat_suffixes = ('sum', 'mean', 'max', 'std')
    invoice_column_names = (
        ['id']
        + [
            f'cons_level_{level}_{stat}'
            for level in range(1, 5)
            for stat in stat_suffixes
        ]
        + [f'date_{stat}' for stat in stat_suffixes]
        + ['num_invoices']
    )

    # Client side: load, convert dates, de-duplicate, set categorical dtypes
    client_df = import_client()
    client_df = convert_date(client_df)
    client_df = drop_duplicates(client_df)
    client_df = convert_to_categorical(
        client_df, cols = categorical_column_names
        )

    # Invoice side: load, convert dates, de-duplicate, aggregate per id
    invoice_df = import_invoice()
    invoice_df = convert_date(invoice_df)
    invoice_df = drop_duplicates(invoice_df)
    invoice_df = aggregate_invoice(invoice_df)
    invoice_df = manual_fix_names(
        invoice_df, 
        new_col_names = invoice_column_names
        )

    # Join both sides and prepare the features
    merged_df = merge(client_df = client_df, invoice_df = invoice_df)
    prepared_df = prep_dataframe(
        df = merged_df,
        response_col_name = 'target',
        cat_col_names = categorical_column_names
    )
    print(prepared_df.columns.values)

    # NOTE(review): no cat_col_names are passed here, so every column except
    # 'target' - including the one-hot columns created above - goes into PCA.
    # Confirm that this is intended.
    reduced_df = principal_component_analysis(
        df = prepared_df,
        response_col_name = 'target'
    )
    print(reduced_df.columns.values)
    print(reduced_df.head())



In [166]:
## Run to execute main workflow
# Runs the whole pipeline defined in main(). With the import helpers'
# default arguments this reads Data/client.csv and Data/invoice.csv,
# so both files must exist relative to the working directory.

main()

  df['date'] = pd.to_datetime(df['date'])
  df['date'] = pd.to_datetime(df['date'])


No duplicates found!
Duplicates found! Cleaning them up...
['date' 'cons_level_1_sum' 'cons_level_1_mean' 'cons_level_1_max'
 'cons_level_1_std' 'cons_level_2_sum' 'cons_level_2_mean'
 'cons_level_2_max' 'cons_level_2_std' 'cons_level_3_sum'
 'cons_level_3_mean' 'cons_level_3_max' 'cons_level_3_std'
 'cons_level_4_sum' 'cons_level_4_mean' 'cons_level_4_max'
 'cons_level_4_std' 'date_sum' 'date_mean' 'date_max' 'date_std'
 'num_invoices' 'region_103' 'region_104' 'region_105' 'region_106'
 'region_107' 'region_206' 'region_301' 'region_302' 'region_303'
 'region_304' 'region_305' 'region_306' 'region_307' 'region_308'
 'region_309' 'region_310' 'region_311' 'region_312' 'region_313'
 'region_371' 'region_372' 'region_379' 'region_399' 'dis_62' 'dis_63'
 'dis_69' 'catg_12' 'catg_51' 'target']
['PC1' 'PC2' 'PC3' 'PC4' 'PC5' 'PC6' 'PC7' 'PC8' 'PC9' 'PC10' 'PC11'
 'PC12' 'PC13' 'PC14' 'PC15' 'PC16' 'target']
        PC1       PC2       PC3       PC4       PC5       PC6       PC7  \
0 -0.156