In [36]:
## Import packages
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [37]:
## Data Importing functions

def import_client(filepath : str = "Data/client.csv") -> pd.DataFrame:
    """
    Load the client dataset from a CSV file.

    Parameters
    ----------
    filepath : str
        Location of the client CSV; defaults to "Data/client.csv".

    Returns
    -------
    pd.DataFrame
        The parsed file contents, read with pandas' default CSV options.
    """
    client_frame = pd.read_csv(filepath)
    return client_frame

def import_invoice(filepath : str = "Data/invoice.csv") -> pd.DataFrame:
    """
    Load the invoice dataset from a CSV file.

    Parameters
    ----------
    filepath : str
        Location of the invoice CSV; defaults to "Data/invoice.csv".

    Returns
    -------
    pd.DataFrame
        The parsed file contents, read with pandas' default CSV options.
    """
    invoice_frame = pd.read_csv(filepath)
    return invoice_frame

In [38]:
## Data Preprocessing functions

def convert_to_dates(
        df : pd.DataFrame,
        date_col : str = 'date',
        **to_datetime_kwargs
        ) -> pd.DataFrame:
    """
    Returns a copy of the dataframe with one column parsed to datetime.

    The input dataframe is left untouched, so re-running a cell that
    calls this function always starts from the same data.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe containing the column to convert.
    date_col : str
        Name of the column to parse. Defaults to 'date', the name the
        workflow in this notebook uses for both datasets.
    **to_datetime_kwargs
        Forwarded unchanged to ``pd.to_datetime`` (e.g. ``format=...``
        or ``dayfirst=True``). Passing an explicit format avoids
        pandas having to guess it, which can be ambiguous for
        day/month ordering.

    Returns
    -------
    pd.DataFrame
        New dataframe with `date_col` as a datetime column; column
        order is preserved.

    Raises
    ------
    KeyError
        If `date_col` is not a column of `df`.
    """
    parsed = pd.to_datetime(df[date_col], **to_datetime_kwargs)
    converted = df.copy() # Work on a copy: never mutate the caller's frame
    converted[date_col] = parsed
    return converted

def drop_duplicates(df : pd.DataFrame) -> pd.DataFrame:
    """
    Reports whether fully duplicated rows exist and removes them.

    When no row is duplicated, a message is printed and the dataframe
    is returned as-is. Otherwise a message is printed and a
    de-duplicated dataframe with a fresh 0..n-1 index is returned.
    """
    has_repeats = df.duplicated().any()
    if not has_repeats:
        print("No duplicates found!")
        return df
    print("Duplicates found! Cleaning them up...")
    # Keep the first occurrence of each row, then renumber the index
    return df.drop_duplicates().reset_index(drop = True)

def convert_to_categorical(
        df : pd.DataFrame,
        cols : list[str]
        ) -> pd.DataFrame:
    """
    Returns a copy of the dataframe with the given columns cast to
    the categorical datatype.

    The input dataframe is left untouched, so re-running a cell that
    calls this function always starts from the same data.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe containing the columns to convert.
    cols : list[str]
        Names of the columns to convert. A single column name given
        as a plain string is also accepted.

    Returns
    -------
    pd.DataFrame
        New dataframe; every column in `cols` has dtype 'category',
        all other columns keep their dtype.

    Raises
    ------
    KeyError
        If any name in `cols` is not a column of `df`.
    """
    # A bare string would otherwise be iterated character by character
    col_names = [cols] if isinstance(cols, str) else list(cols)
    # astype with a per-column mapping returns a new frame
    return df.astype({name: 'category' for name in col_names})


In [39]:
## Data Feature Engineering functions

def aggregate_invoice(df : pd.DataFrame) -> pd.DataFrame:
    """
    Collapses the invoice dataframe to one row per id.

    For each of consommation_level_1..4 the sum, mean, max and std
    are computed per id, and the non-null counter_statue entries are
    counted per id. The two-level column labels produced by the
    aggregation are flattened to '<column>_<statistic>'; the 'id'
    column keeps its name.
    """
    level_stats = ['sum', 'mean', 'max', 'std']
    # Same four statistics for every consumption level
    agg_spec = {
        f'consommation_level_{level}': list(level_stats)
        for level in range(1, 5)
    }
    agg_spec['counter_statue'] = 'count' # Count of entries per id

    per_id = df.groupby('id').agg(agg_spec)
    per_id = per_id.reset_index() # Bring 'id' back as a column

    flat_names = []
    for parts in per_id.columns.values:
        if parts[1]: # Second level present: '<column>_<statistic>'
            flat_names.append('_'.join(parts).strip())
        else: # Only 'id' has an empty second level
            flat_names.append(parts[0])
    per_id.columns = flat_names
    return per_id

def manual_fix_names(
        df : pd.DataFrame,
        new_col_names : list[str]
        ) -> pd.DataFrame:
    """
    Returns a copy of the dataframe with its columns renamed
    positionally.

    The input dataframe is left untouched, so re-running a cell that
    calls this function always starts from the same data.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe whose columns should be renamed.
    new_col_names : list[str]
        New names, one per existing column, in column order.

    Returns
    -------
    pd.DataFrame
        New dataframe with the same data and the new column names.

    Raises
    ------
    ValueError
        If the number of names does not match the number of columns.
    """
    names = list(new_col_names)
    # Renaming is positional, so a length mismatch is always a mistake;
    # fail with a message that states both counts.
    if len(names) != len(df.columns):
        raise ValueError(
            f"Expected {len(df.columns)} column names, got {len(names)}"
        )
    renamed = df.copy() # Work on a copy: never mutate the caller's frame
    renamed.columns = names
    return renamed


In [40]:
## Data Joining functions

def merge(
        client_df : pd.DataFrame,
        invoice_df : pd.DataFrame,
        merge_by : str = "id"
        ) -> pd.DataFrame:
    """
    Joins the client dataframe with the invoice dataframe.

    The join key is `merge_by` ('id' by default) and must exist in
    both dataframes. pandas' default join type (inner) is used, so
    only keys present on both sides appear in the result.
    """
    return client_df.merge(invoice_df, on = merge_by)


In [41]:
## Define main workflow

def main():
    """
    Runs the full preparation workflow and prints the first rows of
    the result.

    Steps, in order: load both datasets from their default paths,
    parse the date columns, drop duplicated rows, cast the client
    label columns to categorical, aggregate invoices per id, rename
    the aggregated columns, and merge clients with the aggregated
    invoices.
    """
    # Load raw data
    clients = import_client()
    invoices = import_invoice()

    # Parse date columns
    clients = convert_to_dates(clients)
    invoices = convert_to_dates(invoices)

    # Remove duplicated rows
    clients = drop_duplicates(clients)
    invoices = drop_duplicates(invoices)

    # Client label columns become categorical
    categorical_cols = ['region', 'dis', 'id', 'catg', 'target']
    clients = convert_to_categorical(clients, cols = categorical_cols)

    # One row per id with consumption statistics
    invoices = aggregate_invoice(invoices)

    # Shorter names, in the order the aggregation emits its columns:
    # id, four statistics per consumption level, then the invoice count
    stats = ('sum', 'mean', 'max', 'std')
    feature_names = ['id']
    feature_names += [
        f'cons_level_{level}_{stat}'
        for level in range(1, 5)
        for stat in stats
    ]
    feature_names.append('num_invoices')
    invoices = manual_fix_names(invoices, new_col_names = feature_names)

    merged_df = merge(client_df = clients, invoice_df = invoices)
    print(merged_df.head())


In [42]:
## Run to execute main workflow

# Runs the whole workflow defined in main(): loads both CSVs, preprocesses
# and aggregates them, merges the result and prints its first rows.
main()

  df['date'] = pd.to_datetime(df['date'])
  df['date'] = pd.to_datetime(df['date'])


No duplicates found!
Duplicates found! Cleaning them up...
  region       date dis    id catg target  cons_level_1_sum  \
0    101 1994-12-31  60     0   11      0             12334   
1    107 2002-05-29  69     1   11      0             20629   
2    301 1986-03-13  62    10   11      0             14375   
3    105 1996-07-11  69   100   11      0                24   
4    303 2014-10-14  62  1000   11      0              9292   

   cons_level_1_mean  cons_level_1_max  cons_level_1_std  ...  \
0         352.400000              1200        310.343472  ...   
1         557.540541              1207        197.935960  ...   
2         798.611111              2400        513.841374  ...   
3           1.200000                15          3.607011  ...   
4         663.714286               800        224.831365  ...   

   cons_level_2_std  cons_level_3_sum  cons_level_3_mean  cons_level_3_max  \
0         43.568935                 0           0.000000                 0   
1          0.00