In [1]:
## Import packages
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
## Data Importing functions

def import_client(filepath : str = "Data/client.csv") -> pd.DataFrame:
    """
    Load the client dataset from a CSV file.

    Parameters
    ----------
    filepath : str
        Location of the CSV file. Defaults to "Data/client.csv".

    Returns
    -------
    pd.DataFrame
        The file contents as parsed by ``pd.read_csv`` with its
        default options.
    """
    client_df = pd.read_csv(filepath)
    return client_df

def import_invoice(filepath : str = "Data/invoice.csv") -> pd.DataFrame:
    """
    Load the invoice dataset from a CSV file.

    Parameters
    ----------
    filepath : str
        Location of the CSV file. Defaults to "Data/invoice.csv".

    Returns
    -------
    pd.DataFrame
        The file contents as parsed by ``pd.read_csv`` with its
        default options.
    """
    invoice_df = pd.read_csv(filepath)
    return invoice_df

In [None]:
## Data Preprocessing functions

def convert_to_dates(df : pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of the dataframe with its 'date' column parsed
    into datetime values.
    *Column name 'date' is fixed for both datasets.

    The input frame is left untouched, so re-running a cell that calls
    this function always starts from the same raw data.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing a 'date' column.

    Returns
    -------
    pd.DataFrame
        New frame with the same columns in the same order, where
        'date' has been converted with ``pd.to_datetime``.

    Raises
    ------
    KeyError
        If the frame has no 'date' column.
    """
    # assign() builds a new frame; the caller's frame keeps its original column
    return df.assign(date = pd.to_datetime(df['date']))

def drop_duplicates(df : pd.DataFrame) -> pd.DataFrame:
    """
    Report whether the dataframe contains fully duplicated rows and
    remove them when it does.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to check.

    Returns
    -------
    pd.DataFrame
        The input frame itself when no duplicates exist; otherwise a
        new frame keeping the first occurrence of each row, with the
        index renumbered from 0.
    """
    repeated_rows = df.duplicated() # True for every repeat after the first

    if not repeated_rows.any():
        print("No duplicates found!")
        return df

    print("Duplicates found! Cleaning them up...")
    unique_rows = df[~repeated_rows] # Keep first occurrences only
    return unique_rows.reset_index(drop = True)

def convert_to_categorical(
        df : pd.DataFrame,
        cols : list[str]
        ) -> pd.DataFrame:
    """
    Return a copy of the dataframe with the given columns cast to the
    categorical datatype.

    The input frame is left untouched, so re-running a cell that calls
    this function always starts from the same data.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the columns to convert.
    cols : list[str]
        Names of the columns to convert. A single column name given as
        a plain string is also accepted. An empty list converts nothing.

    Returns
    -------
    pd.DataFrame
        New frame with the same columns in the same order, where every
        column named in `cols` has dtype 'category'.

    Raises
    ------
    KeyError
        If a name in `cols` is not a column of the frame.
    """
    # A bare string is one column name, not an iterable of one-letter names
    if isinstance(cols, str):
        cols = [cols]
    # astype() with a dtype mapping returns a new frame instead of writing
    # back into the caller's frame
    return df.astype({col: 'category' for col in cols})


In [None]:
## Data Feature Engineering functions

def aggregate_invoice(df : pd.DataFrame) -> pd.DataFrame:
    """
    Collapse the invoice dataframe to one row per 'id' and derive
    summary features.

    For each of consommation_level_1 .. consommation_level_4 the sum,
    mean, max and std are computed per id, and the 'count' aggregation
    of 'counter_statue' is added as an invoice counter.

    Parameters
    ----------
    df : pd.DataFrame
        Invoice frame with the columns 'id', 'consommation_level_1'
        to 'consommation_level_4' and 'counter_statue'.

    Returns
    -------
    pd.DataFrame
        One row per id. Columns are 'id' followed by flat names of the
        form '<column>_<statistic>', e.g. 'consommation_level_1_sum'
        and 'counter_statue_count'.
    """
    level_stats = ['sum', 'mean', 'max', 'std']
    # Same four statistics for every consumption level
    agg_spec = {
        f'consommation_level_{level}': list(level_stats)
        for level in range(1, 5)
    }
    agg_spec['counter_statue'] = 'count' # Count number of invoices

    per_id = df.groupby('id').agg(agg_spec).reset_index()

    # Flatten the two-level column labels: "<column>_<statistic>" when a
    # statistic part is present, otherwise just the column name (e.g. 'id')
    flat_names = []
    for col in per_id.columns.values:
        if col[1]:
            flat_names.append('_'.join(col).strip())
        else:
            flat_names.append(col[0])
    per_id.columns = flat_names
    return per_id

def manual_fix_names(
        df : pd.DataFrame,
        new_col_names : list[str]
        ) -> pd.DataFrame:
    """
    Return the dataframe with its column names replaced, position by
    position, by the given names.

    The input frame keeps its original column names, so re-running a
    cell that calls this function always starts from the same data.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns should be renamed.
    new_col_names : list[str]
        Replacement names, one per existing column, in column order.

    Returns
    -------
    pd.DataFrame
        New frame with the same data and the new column names.

    Raises
    ------
    ValueError
        If the number of names does not match the number of columns.
    """
    # set_axis() relabels a new object instead of assigning to df.columns
    return df.set_axis(new_col_names, axis = 1)


In [None]:
## Data Joining functions

def merge(
        client_df : pd.DataFrame,
        invoice_df : pd.DataFrame,
        merge_by : str = "id"
        ) -> pd.DataFrame:
    """
    Join the client dataframe with the invoice dataframe on a shared
    key column.

    Parameters
    ----------
    client_df : pd.DataFrame
        Left-hand frame of the join.
    invoice_df : pd.DataFrame
        Right-hand frame of the join.
    merge_by : str
        Name of the key column present in both frames.
        Defaults to 'id' (for client and invoice).

    Returns
    -------
    pd.DataFrame
        Result of pandas' merge with its default join type, i.e. only
        keys present in both frames are kept.
    """
    return client_df.merge(invoice_df, on = merge_by)
